From 5a973ec3cfe768a3a60771af156c4457b6ad19b4 Mon Sep 17 00:00:00 2001 From: Tamar Christina Date: Wed, 21 Jul 2021 15:11:39 +0100 Subject: [PATCH] acle: Correct usdot specification on AArch32 --- neon_intrinsics/advsimd.rst | 12 +++++++----- neon_intrinsics/advsimd.template.rst | 2 ++ tools/intrinsic_db/advsimd.csv | 10 +++++----- 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/neon_intrinsics/advsimd.rst b/neon_intrinsics/advsimd.rst index 60110baf..d9313522 100644 --- a/neon_intrinsics/advsimd.rst +++ b/neon_intrinsics/advsimd.rst @@ -151,6 +151,8 @@ Changes between next release and 2021Q2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * Fixed the guard macro for the base intrinsics. +* Correct usdot specification on AArch32. + List of Intrinsics ################## @@ -26502,7 +26504,7 @@ Dot product | uint8x8_t b, | 0 <= lane <= 1 | | | | | const int lane) | | | | | +----------------------------------+------------------------+------------------------------------+---------------------+---------------------------+ -| .. code:: c | :: | :: | :: | ``A64`` | +| .. code:: c | :: | :: | :: | ``A32/A64`` | | | | | | | | int32x2_t vusdot_laneq_s32( | r -> Vd.2S | USDOT Vd.2S,Vn.8B,Vm.4B[lane] | Vd.2S -> result | | | int32x2_t r, | a -> Vn.8B | | | | @@ -26510,7 +26512,7 @@ Dot product | int8x16_t b, | 0 <= lane <= 3 | | | | | const int lane) | | | | | +----------------------------------+------------------------+------------------------------------+---------------------+---------------------------+ -| .. code:: c | :: | :: | :: | ``A64`` | +| .. code:: c | :: | :: | :: | ``A32/A64`` | | | | | | | | int32x2_t vsudot_laneq_s32( | r -> Vd.2S | SUDOT Vd.2S,Vn.8B,Vm.4B[lane] | Vd.2S -> result | | | int32x2_t r, | a -> Vn.8B | | | | @@ -26518,7 +26520,7 @@ Dot product | uint8x16_t b, | 0 <= lane <= 3 | | | | | const int lane) | | | | | +----------------------------------+------------------------+------------------------------------+---------------------+---------------------------+ -| .. code:: c | :: | :: | :: | ``A64`` | +| .. code:: c | :: | :: | :: | ``A32/A64`` | | | | | | | | int32x4_t vusdotq_s32( | r -> Vd.4S | USDOT Vd.4S,Vn.16B,Vm.16B | Vd.4S -> result | | | int32x4_t r, | a -> Vn.16B | | | | @@ -26541,7 +26543,7 @@ Dot product | uint8x8_t b, | 0 <= lane <= 1 | | | | | const int lane) | | | | | +----------------------------------+------------------------+------------------------------------+---------------------+---------------------------+ -| .. code:: c | :: | :: | :: | ``A64`` | +| .. code:: c | :: | :: | :: | ``A32/A64`` | | | | | | | | int32x4_t vusdotq_laneq_s32( | r -> Vd.4S | USDOT Vd.4S,Vn.16B,Vm.4B[lane] | Vd.4S -> result | | | int32x4_t r, | a -> Vn.16B | | | | @@ -26549,7 +26551,7 @@ Dot product | int8x16_t b, | 0 <= lane <= 3 | | | | | const int lane) | | | | | +----------------------------------+------------------------+------------------------------------+---------------------+---------------------------+ -| .. code:: c | :: | :: | :: | ``A64`` | +| .. code:: c | :: | :: | :: | ``A32/A64`` | | | | | | | | int32x4_t vsudotq_laneq_s32( | r -> Vd.4S | SUDOT Vd.4S,Vn.16B,Vm.4B[lane] | Vd.4S -> result | | | int32x4_t r, | a -> Vn.8B | | | | diff --git a/neon_intrinsics/advsimd.template.rst b/neon_intrinsics/advsimd.template.rst index 3d098d98..61a9c8a4 100644 --- a/neon_intrinsics/advsimd.template.rst +++ b/neon_intrinsics/advsimd.template.rst @@ -151,6 +151,8 @@ Changes between next release and 2021Q2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * Fixed the guard macro for the base intrinsics. +* Correct usdot specification on AArch32. + List of Intrinsics ################## diff --git a/tools/intrinsic_db/advsimd.csv b/tools/intrinsic_db/advsimd.csv index c23bd948..44713f5b 100644 --- a/tools/intrinsic_db/advsimd.csv +++ b/tools/intrinsic_db/advsimd.csv @@ -4258,13 +4258,13 @@ int32x4_t vusmmlaq_s32(int32x4_t r, uint8x16_t a, int8x16_t b) r -> Vd.4S;a -> V int32x2_t vusdot_s32(int32x2_t r, uint8x8_t a, int8x8_t b) r -> Vd.2S;a -> Vn.8B;b -> Vm.8B USDOT Vd.2S,Vn.8B,Vm.8B Vd.2S -> result A32/A64 int32x2_t vusdot_lane_s32(int32x2_t r, uint8x8_t a, int8x8_t b, __builtin_constant_p(lane)) r -> Vd.2S;a -> Vn.8B;b -> Vm.4B; 0 <= lane <= 1 USDOT Vd.2S,Vn.8B,Vm.4B[lane] Vd.2S -> result A32/A64 int32x2_t vsudot_lane_s32(int32x2_t r, int8x8_t a, uint8x8_t b, __builtin_constant_p(lane)) r -> Vd.2S;a -> Vn.8B;b -> Vm.4B; 0 <= lane <= 1 SUDOT Vd.2S,Vn.8B,Vm.4B[lane] Vd.2S -> result A32/A64 -int32x2_t vusdot_laneq_s32(int32x2_t r, uint8x8_t a, int8x16_t b, __builtin_constant_p(lane)) r -> Vd.2S;a -> Vn.8B;b -> Vm.4B; 0 <= lane <= 3 USDOT Vd.2S,Vn.8B,Vm.4B[lane] Vd.2S -> result A64 -int32x2_t vsudot_laneq_s32(int32x2_t r, int8x8_t a, uint8x16_t b, __builtin_constant_p(lane)) r -> Vd.2S;a -> Vn.8B;b -> Vm.4B; 0 <= lane <= 3 SUDOT Vd.2S,Vn.8B,Vm.4B[lane] Vd.2S -> result A64 -int32x4_t vusdotq_s32(int32x4_t r, uint8x16_t a, int8x16_t b) r -> Vd.4S;a -> Vn.16B;b -> Vm.16B USDOT Vd.4S,Vn.16B,Vm.16B Vd.4S -> result A64 +int32x2_t vusdot_laneq_s32(int32x2_t r, uint8x8_t a, int8x16_t b, __builtin_constant_p(lane)) r -> Vd.2S;a -> Vn.8B;b -> Vm.4B; 0 <= lane <= 3 USDOT Vd.2S,Vn.8B,Vm.4B[lane] Vd.2S -> result A32/A64 +int32x2_t vsudot_laneq_s32(int32x2_t r, int8x8_t a, uint8x16_t b, __builtin_constant_p(lane)) r -> Vd.2S;a -> Vn.8B;b -> Vm.4B; 0 <= lane <= 3 SUDOT Vd.2S,Vn.8B,Vm.4B[lane] Vd.2S -> result A32/A64 +int32x4_t vusdotq_s32(int32x4_t r, uint8x16_t a, int8x16_t b) r -> Vd.4S;a -> Vn.16B;b -> Vm.16B USDOT Vd.4S,Vn.16B,Vm.16B Vd.4S -> result A32/A64 int32x4_t vusdotq_lane_s32(int32x4_t r, uint8x16_t a, int8x8_t b, __builtin_constant_p(lane)) r -> Vd.4S;a -> Vn.16B;b -> Vm.4B; 0 <= lane <= 1 USDOT Vd.4S,Vn.16B,Vm.4B[lane] Vd.4S -> result A32/A64 int32x4_t vsudotq_lane_s32(int32x4_t r, int8x16_t a, uint8x8_t b, __builtin_constant_p(lane)) r -> Vd.4S;a -> Vn.8B;b -> Vm.4B; 0 <= lane <= 1 SUDOT Vd.4S,Vn.16B,Vm.4B[lane] Vd.4S -> result A32/A64 -int32x4_t vusdotq_laneq_s32(int32x4_t r, uint8x16_t a, int8x16_t b, __builtin_constant_p(lane)) r -> Vd.4S;a -> Vn.16B;b -> Vm.4B; 0 <= lane <= 3 USDOT Vd.4S,Vn.16B,Vm.4B[lane] Vd.4S -> result A64 -int32x4_t vsudotq_laneq_s32(int32x4_t r, int8x16_t a, uint8x16_t b, __builtin_constant_p(lane)) r -> Vd.4S;a -> Vn.8B;b -> Vm.4B; 0 <= lane <= 3 SUDOT Vd.4S,Vn.16B,Vm.4B[lane] Vd.4S -> result A64 +int32x4_t vusdotq_laneq_s32(int32x4_t r, uint8x16_t a, int8x16_t b, __builtin_constant_p(lane)) r -> Vd.4S;a -> Vn.16B;b -> Vm.4B; 0 <= lane <= 3 USDOT Vd.4S,Vn.16B,Vm.4B[lane] Vd.4S -> result A32/A64 +int32x4_t vsudotq_laneq_s32(int32x4_t r, int8x16_t a, uint8x16_t b, __builtin_constant_p(lane)) r -> Vd.4S;a -> Vn.8B;b -> Vm.4B; 0 <= lane <= 3 SUDOT Vd.4S,Vn.16B,Vm.4B[lane] Vd.4S -> result A32/A64
Bfloat16 intrinsics Requires the +bf16 architecture extension. bfloat16x4_t vcreate_bf16(uint64_t a) a -> Xn INS Vd.D[0],Xn Vd.4H -> result A32/A64