Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 7 additions & 5 deletions neon_intrinsics/advsimd.rst
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,8 @@ Changes between next release and 2021Q2
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
* Fixed the guard macro for the base intrinsics.

* Corrected the usdot specification on AArch32.

List of Intrinsics
##################

Expand Down Expand Up @@ -26502,23 +26504,23 @@ Dot product
| uint8x8_t b, | 0 <= lane <= 1 | | | |
| const int lane) | | | | |
+----------------------------------+------------------------+------------------------------------+---------------------+---------------------------+
| .. code:: c | :: | :: | :: | ``A64`` |
| .. code:: c | :: | :: | :: | ``A32/A64`` |
| | | | | |
| int32x2_t vusdot_laneq_s32( | r -> Vd.2S | USDOT Vd.2S,Vn.8B,Vm.4B[lane] | Vd.2S -> result | |
| int32x2_t r, | a -> Vn.8B | | | |
| uint8x8_t a, | b -> Vm.4B | | | |
| int8x16_t b, | 0 <= lane <= 3 | | | |
| const int lane) | | | | |
+----------------------------------+------------------------+------------------------------------+---------------------+---------------------------+
| .. code:: c | :: | :: | :: | ``A64`` |
| .. code:: c | :: | :: | :: | ``A32/A64`` |
| | | | | |
| int32x2_t vsudot_laneq_s32( | r -> Vd.2S | SUDOT Vd.2S,Vn.8B,Vm.4B[lane] | Vd.2S -> result | |
| int32x2_t r, | a -> Vn.8B | | | |
| int8x8_t a, | b -> Vm.4B | | | |
| uint8x16_t b, | 0 <= lane <= 3 | | | |
| const int lane) | | | | |
+----------------------------------+------------------------+------------------------------------+---------------------+---------------------------+
| .. code:: c | :: | :: | :: | ``A64`` |
| .. code:: c | :: | :: | :: | ``A32/A64`` |
| | | | | |
| int32x4_t vusdotq_s32( | r -> Vd.4S | USDOT Vd.4S,Vn.16B,Vm.16B | Vd.4S -> result | |
| int32x4_t r, | a -> Vn.16B | | | |
Expand All @@ -26541,15 +26543,15 @@ Dot product
| uint8x8_t b, | 0 <= lane <= 1 | | | |
| const int lane) | | | | |
+----------------------------------+------------------------+------------------------------------+---------------------+---------------------------+
| .. code:: c | :: | :: | :: | ``A64`` |
| .. code:: c | :: | :: | :: | ``A32/A64`` |
| | | | | |
| int32x4_t vusdotq_laneq_s32( | r -> Vd.4S | USDOT Vd.4S,Vn.16B,Vm.4B[lane] | Vd.4S -> result | |
| int32x4_t r, | a -> Vn.16B | | | |
| uint8x16_t a, | b -> Vm.4B | | | |
| int8x16_t b, | 0 <= lane <= 3 | | | |
| const int lane) | | | | |
+----------------------------------+------------------------+------------------------------------+---------------------+---------------------------+
| .. code:: c | :: | :: | :: | ``A64`` |
| .. code:: c | :: | :: | :: | ``A32/A64`` |
| | | | | |
| int32x4_t vsudotq_laneq_s32( | r -> Vd.4S | SUDOT Vd.4S,Vn.16B,Vm.4B[lane] | Vd.4S -> result | |
| int32x4_t r, | a -> Vn.16B | | | |
Expand Down
2 changes: 2 additions & 0 deletions neon_intrinsics/advsimd.template.rst
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,8 @@ Changes between next release and 2021Q2
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
* Fixed the guard macro for the base intrinsics.

* Corrected the usdot specification on AArch32.

List of Intrinsics
##################

Expand Down
10 changes: 5 additions & 5 deletions tools/intrinsic_db/advsimd.csv
Original file line number Diff line number Diff line change
Expand Up @@ -4258,13 +4258,13 @@ int32x4_t vusmmlaq_s32(int32x4_t r, uint8x16_t a, int8x16_t b) r -> Vd.4S;a -> V
int32x2_t vusdot_s32(int32x2_t r, uint8x8_t a, int8x8_t b) r -> Vd.2S;a -> Vn.8B;b -> Vm.8B USDOT Vd.2S,Vn.8B,Vm.8B Vd.2S -> result A32/A64
int32x2_t vusdot_lane_s32(int32x2_t r, uint8x8_t a, int8x8_t b, __builtin_constant_p(lane)) r -> Vd.2S;a -> Vn.8B;b -> Vm.4B; 0 <= lane <= 1 USDOT Vd.2S,Vn.8B,Vm.4B[lane] Vd.2S -> result A32/A64
int32x2_t vsudot_lane_s32(int32x2_t r, int8x8_t a, uint8x8_t b, __builtin_constant_p(lane)) r -> Vd.2S;a -> Vn.8B;b -> Vm.4B; 0 <= lane <= 1 SUDOT Vd.2S,Vn.8B,Vm.4B[lane] Vd.2S -> result A32/A64
int32x2_t vusdot_laneq_s32(int32x2_t r, uint8x8_t a, int8x16_t b, __builtin_constant_p(lane)) r -> Vd.2S;a -> Vn.8B;b -> Vm.4B; 0 <= lane <= 3 USDOT Vd.2S,Vn.8B,Vm.4B[lane] Vd.2S -> result A64
int32x2_t vsudot_laneq_s32(int32x2_t r, int8x8_t a, uint8x16_t b, __builtin_constant_p(lane)) r -> Vd.2S;a -> Vn.8B;b -> Vm.4B; 0 <= lane <= 3 SUDOT Vd.2S,Vn.8B,Vm.4B[lane] Vd.2S -> result A64
int32x4_t vusdotq_s32(int32x4_t r, uint8x16_t a, int8x16_t b) r -> Vd.4S;a -> Vn.16B;b -> Vm.16B USDOT Vd.4S,Vn.16B,Vm.16B Vd.4S -> result A64
int32x2_t vusdot_laneq_s32(int32x2_t r, uint8x8_t a, int8x16_t b, __builtin_constant_p(lane)) r -> Vd.2S;a -> Vn.8B;b -> Vm.4B; 0 <= lane <= 3 USDOT Vd.2S,Vn.8B,Vm.4B[lane] Vd.2S -> result A32/A64
int32x2_t vsudot_laneq_s32(int32x2_t r, int8x8_t a, uint8x16_t b, __builtin_constant_p(lane)) r -> Vd.2S;a -> Vn.8B;b -> Vm.4B; 0 <= lane <= 3 SUDOT Vd.2S,Vn.8B,Vm.4B[lane] Vd.2S -> result A32/A64
int32x4_t vusdotq_s32(int32x4_t r, uint8x16_t a, int8x16_t b) r -> Vd.4S;a -> Vn.16B;b -> Vm.16B USDOT Vd.4S,Vn.16B,Vm.16B Vd.4S -> result A32/A64
int32x4_t vusdotq_lane_s32(int32x4_t r, uint8x16_t a, int8x8_t b, __builtin_constant_p(lane)) r -> Vd.4S;a -> Vn.16B;b -> Vm.4B; 0 <= lane <= 1 USDOT Vd.4S,Vn.16B,Vm.4B[lane] Vd.4S -> result A32/A64
int32x4_t vsudotq_lane_s32(int32x4_t r, int8x16_t a, uint8x8_t b, __builtin_constant_p(lane)) r -> Vd.4S;a -> Vn.16B;b -> Vm.4B; 0 <= lane <= 1 SUDOT Vd.4S,Vn.16B,Vm.4B[lane] Vd.4S -> result A32/A64
int32x4_t vusdotq_laneq_s32(int32x4_t r, uint8x16_t a, int8x16_t b, __builtin_constant_p(lane)) r -> Vd.4S;a -> Vn.16B;b -> Vm.4B; 0 <= lane <= 3 USDOT Vd.4S,Vn.16B,Vm.4B[lane] Vd.4S -> result A64
int32x4_t vsudotq_laneq_s32(int32x4_t r, int8x16_t a, uint8x16_t b, __builtin_constant_p(lane)) r -> Vd.4S;a -> Vn.8B;b -> Vm.4B; 0 <= lane <= 3 SUDOT Vd.4S,Vn.16B,Vm.4B[lane] Vd.4S -> result A64
int32x4_t vusdotq_laneq_s32(int32x4_t r, uint8x16_t a, int8x16_t b, __builtin_constant_p(lane)) r -> Vd.4S;a -> Vn.16B;b -> Vm.4B; 0 <= lane <= 3 USDOT Vd.4S,Vn.16B,Vm.4B[lane] Vd.4S -> result A32/A64
int32x4_t vsudotq_laneq_s32(int32x4_t r, int8x16_t a, uint8x16_t b, __builtin_constant_p(lane)) r -> Vd.4S;a -> Vn.16B;b -> Vm.4B; 0 <= lane <= 3 SUDOT Vd.4S,Vn.16B,Vm.4B[lane] Vd.4S -> result A32/A64

<SECTION> Bfloat16 intrinsics Requires the +bf16 architecture extension.
bfloat16x4_t vcreate_bf16(uint64_t a) a -> Xn INS Vd.D[0],Xn Vd.4H -> result A32/A64
Expand Down