[Clang] VectorExprEvaluator::VisitCallExpr / InterpretBuiltin - add AVX512 VPTERNLOGD/VPTERNLOGQ intrinsics to be used in constexpr#158703
Conversation
|
@llvm/pr-subscribers-clang @llvm/pr-subscribers-backend-x86 Author: Shawn (kimsh02) ChangesFix #157698 Add handling for Patch is 35.42 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/158703.diff 5 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index 1a8645f99e281..a4335256f6deb 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -2398,28 +2398,36 @@ let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>
def psraq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<2, long long int>)">;
def psrld512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<4, int>)">;
def psrlq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<2, long long int>)">;
+}
+
+let Features = "avx512f",
+ Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
def pternlogd512_mask : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>, _Constant int, unsigned short)">;
def pternlogd512_maskz : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>, _Constant int, unsigned short)">;
def pternlogq512_mask : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, _Vector<8, long long int>, _Constant int, unsigned char)">;
def pternlogq512_maskz : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, _Vector<8, long long int>, _Constant int, unsigned char)">;
}
-let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+let Features = "avx512vl",
+ Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
def pternlogd128_mask : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>, _Constant int, unsigned char)">;
def pternlogd128_maskz : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>, _Constant int, unsigned char)">;
}
-let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+let Features = "avx512vl",
+ Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
def pternlogd256_mask : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>, _Constant int, unsigned char)">;
def pternlogd256_maskz : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>, _Constant int, unsigned char)">;
}
-let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+let Features = "avx512vl",
+ Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
def pternlogq128_mask : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, _Vector<2, long long int>, _Constant int, unsigned char)">;
def pternlogq128_maskz : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, _Vector<2, long long int>, _Constant int, unsigned char)">;
}
-let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+let Features = "avx512vl",
+ Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
def pternlogq256_mask : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Vector<4, long long int>, _Constant int, unsigned char)">;
def pternlogq256_maskz : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Vector<4, long long int>, _Constant int, unsigned char)">;
}
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index 4461731c25648..4004e9fca86b8 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -2903,6 +2903,55 @@ static bool interp__builtin_elementwise_triop(
return true;
}
+static bool interp__builtin_pternlog(InterpState &S, CodePtr OpPC,
+ const CallExpr *Call, bool MaskZ) {
+ assert(Call->getNumArgs() == 5);
+
+ const VectorType *VecT = Call->getArg(0)->getType()->castAs<VectorType>();
+ unsigned DstLen = VecT->getNumElements();
+ PrimType DstElemT = *S.getContext().classify(VecT->getElementType());
+
+ APSInt U = popToAPSInt(S.Stk, *S.getContext().classify(Call->getArg(4)));
+ APSInt Imm = popToAPSInt(S.Stk, *S.getContext().classify(Call->getArg(3)));
+ const Pointer &C = S.Stk.pop<Pointer>();
+ const Pointer &B = S.Stk.pop<Pointer>();
+ const Pointer &A = S.Stk.pop<Pointer>();
+
+ const Pointer &Dst = S.Stk.peek<Pointer>();
+
+ for (unsigned I = 0; I < DstLen; ++I) {
+ APSInt a, b, c;
+ INT_TYPE_SWITCH(DstElemT, {
+ a = A.elem<T>(I).toAPSInt();
+ b = B.elem<T>(I).toAPSInt();
+ c = C.elem<T>(I).toAPSInt();
+ });
+
+ unsigned BitWidth = a.getBitWidth();
+ APInt R(BitWidth, 0);
+ bool DstUnsigned = a.isUnsigned();
+
+ if (U[I]) {
+ for (unsigned Bit = 0; Bit < BitWidth; ++Bit) {
+ unsigned Idx = (a[Bit] << 2) | (b[Bit] << 1) | (c[Bit]);
+ R.setBitVal(Bit, Imm[Idx]);
+ }
+ INT_TYPE_SWITCH_NO_BOOL(DstElemT, {
+ Dst.elem<T>(I) = static_cast<T>(APSInt(R, DstUnsigned));
+ });
+ } else if (MaskZ) {
+ INT_TYPE_SWITCH_NO_BOOL(DstElemT, {
+ Dst.elem<T>(I) = static_cast<T>(APSInt(R, DstUnsigned));
+ });
+ } else {
+ INT_TYPE_SWITCH_NO_BOOL(DstElemT,
+ { Dst.elem<T>(I) = static_cast<T>(a); });
+ }
+ }
+ Dst.initializeAllElements();
+ return true;
+}
+
bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
uint32_t BuiltinID) {
if (!S.getASTContext().BuiltinInfo.isConstantEvaluated(BuiltinID))
@@ -3543,7 +3592,20 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
case X86::BI__builtin_ia32_selectpd_256:
case X86::BI__builtin_ia32_selectpd_512:
return interp__builtin_select(S, OpPC, Call);
-
+ case X86::BI__builtin_ia32_pternlogd128_mask:
+ case X86::BI__builtin_ia32_pternlogd256_mask:
+ case X86::BI__builtin_ia32_pternlogd512_mask:
+ case X86::BI__builtin_ia32_pternlogq128_mask:
+ case X86::BI__builtin_ia32_pternlogq256_mask:
+ case X86::BI__builtin_ia32_pternlogq512_mask:
+ return interp__builtin_pternlog(S, OpPC, Call, false);
+ case X86::BI__builtin_ia32_pternlogd128_maskz:
+ case X86::BI__builtin_ia32_pternlogd256_maskz:
+ case X86::BI__builtin_ia32_pternlogd512_maskz:
+ case X86::BI__builtin_ia32_pternlogq128_maskz:
+ case X86::BI__builtin_ia32_pternlogq256_maskz:
+ case X86::BI__builtin_ia32_pternlogq512_maskz:
+ return interp__builtin_pternlog(S, OpPC, Call, true);
case Builtin::BI__builtin_elementwise_fshl:
return interp__builtin_elementwise_triop(S, OpPC, Call,
llvm::APIntOps::fshl);
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 5145896930153..1f063308aeca5 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -11997,6 +11997,96 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
return Success(APValue(ResultElements.data(), ResultElements.size()), E);
}
+ case X86::BI__builtin_ia32_pternlogd128_mask:
+ case X86::BI__builtin_ia32_pternlogd256_mask:
+ case X86::BI__builtin_ia32_pternlogd512_mask:
+ case X86::BI__builtin_ia32_pternlogq128_mask:
+ case X86::BI__builtin_ia32_pternlogq256_mask:
+ case X86::BI__builtin_ia32_pternlogq512_mask: {
+ APValue AValue, BValue, CValue, ImmValue, UValue;
+ if (!EvaluateAsRValue(Info, E->getArg(0), AValue) ||
+ !EvaluateAsRValue(Info, E->getArg(1), BValue) ||
+ !EvaluateAsRValue(Info, E->getArg(2), CValue) ||
+ !EvaluateAsRValue(Info, E->getArg(3), ImmValue) ||
+ !EvaluateAsRValue(Info, E->getArg(4), UValue))
+ return false;
+
+ QualType DestEltTy = E->getType()->castAs<VectorType>()->getElementType();
+ bool DestUnsigned = DestEltTy->isUnsignedIntegerOrEnumerationType();
+ APInt Imm = ImmValue.getInt();
+ APInt U = UValue.getInt();
+ unsigned ResultLen = AValue.getVectorLength();
+ SmallVector<APValue, 16> ResultElements;
+ ResultElements.reserve(ResultLen);
+
+ for (unsigned EltNum = 0; EltNum < ResultLen; ++EltNum) {
+ APInt ALane = AValue.getVectorElt(EltNum).getInt();
+ APInt BLane = BValue.getVectorElt(EltNum).getInt();
+ APInt CLane = CValue.getVectorElt(EltNum).getInt();
+
+ if (U[EltNum]) {
+ unsigned BitWidth = ALane.getBitWidth();
+ APInt ResLane(BitWidth, 0);
+
+ for (unsigned Bit = 0; Bit < BitWidth; ++Bit) {
+ unsigned ABit = ALane[Bit];
+ unsigned BBit = BLane[Bit];
+ unsigned CBit = CLane[Bit];
+
+ unsigned Idx = (ABit << 2) | (BBit << 1) | CBit;
+ ResLane.setBitVal(Bit, Imm[Idx]);
+ }
+ ResultElements.push_back(APValue(APSInt(ResLane, DestUnsigned)));
+ } else {
+ ResultElements.push_back(APValue(APSInt(ALane, DestUnsigned)));
+ }
+ }
+ return Success(APValue(ResultElements.data(), ResultElements.size()), E);
+ }
+ case X86::BI__builtin_ia32_pternlogd128_maskz:
+ case X86::BI__builtin_ia32_pternlogd256_maskz:
+ case X86::BI__builtin_ia32_pternlogd512_maskz:
+ case X86::BI__builtin_ia32_pternlogq128_maskz:
+ case X86::BI__builtin_ia32_pternlogq256_maskz:
+ case X86::BI__builtin_ia32_pternlogq512_maskz: {
+ APValue AValue, BValue, CValue, ImmValue, UValue;
+ if (!EvaluateAsRValue(Info, E->getArg(0), AValue) ||
+ !EvaluateAsRValue(Info, E->getArg(1), BValue) ||
+ !EvaluateAsRValue(Info, E->getArg(2), CValue) ||
+ !EvaluateAsRValue(Info, E->getArg(3), ImmValue) ||
+ !EvaluateAsRValue(Info, E->getArg(4), UValue))
+ return false;
+
+ QualType DestEltTy = E->getType()->castAs<VectorType>()->getElementType();
+ bool DestUnsigned = DestEltTy->isUnsignedIntegerOrEnumerationType();
+ APInt Imm = ImmValue.getInt();
+ APInt U = UValue.getInt();
+ unsigned ResultLen = AValue.getVectorLength();
+ SmallVector<APValue, 16> ResultElements;
+ ResultElements.reserve(ResultLen);
+
+ for (unsigned EltNum = 0; EltNum < ResultLen; ++EltNum) {
+ APInt ALane = AValue.getVectorElt(EltNum).getInt();
+ APInt BLane = BValue.getVectorElt(EltNum).getInt();
+ APInt CLane = CValue.getVectorElt(EltNum).getInt();
+
+ unsigned BitWidth = ALane.getBitWidth();
+ APInt ResLane(BitWidth, 0);
+
+ if (U[EltNum]) {
+ for (unsigned Bit = 0; Bit < BitWidth; ++Bit) {
+ unsigned ABit = ALane[Bit];
+ unsigned BBit = BLane[Bit];
+ unsigned CBit = CLane[Bit];
+
+ unsigned Idx = (ABit << 2) | (BBit << 1) | CBit;
+ ResLane.setBitVal(Bit, Imm[Idx]);
+ }
+ }
+ ResultElements.push_back(APValue(APSInt(ResLane, DestUnsigned)));
+ }
+ return Success(APValue(ResultElements.data(), ResultElements.size()), E);
+ }
case Builtin::BI__builtin_elementwise_ctlz:
case Builtin::BI__builtin_elementwise_cttz: {
APValue SourceLHS;
diff --git a/clang/test/CodeGen/X86/avx512f-builtins.c b/clang/test/CodeGen/X86/avx512f-builtins.c
index f93216e546a63..9138bdef7f92b 100644
--- a/clang/test/CodeGen/X86/avx512f-builtins.c
+++ b/clang/test/CodeGen/X86/avx512f-builtins.c
@@ -6206,6 +6206,27 @@ __m512i test_mm512_ternarylogic_epi32(__m512i __A, __m512i __B, __m512i __C) {
// CHECK: @llvm.x86.avx512.pternlog.d.512({{.*}}, i32 240)
return _mm512_ternarylogic_epi32(__A, __B, __C, _MM_TERNLOG_A);
}
+TEST_CONSTEXPR(match_v16si(
+ _mm512_ternarylogic_epi32(
+ ((__m512i)((__v16si){-0x1, 0x0, -0x1, 0x0, -0x1, 0x0, -0x1, 0x0, -0x1, 0x0, -0x1, 0x0, -0x1, 0x0, -0x1, 0x0})),
+ ((__m512i)((__v16si){0xB, 0xB, 0xB, 0xB, 0xB, 0xB, 0xB, 0xB, 0xB, 0xB, 0xB, 0xB, 0xB, 0xB, 0xB, 0xB})),
+ ((__m512i)((__v16si){0xC, 0xC, 0xC, 0xC, 0xC, 0xC, 0xC, 0xC, 0xC, 0xC, 0xC, 0xC, 0xC, 0xC, 0xC, 0xC})),
+ (unsigned char)0xCA),
+ 0xB, 0xC, 0xB, 0xC, 0xB, 0xC, 0xB, 0xC, 0xB, 0xC, 0xB, 0xC, 0xB, 0xC, 0xB, 0xC));
+TEST_CONSTEXPR(match_v16si(
+ _mm512_ternarylogic_epi32(
+ ((__m512i)((__v16si){0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9})),
+ ((__m512i)((__v16si){0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4})),
+ ((__m512i)((__v16si){0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2})),
+ (unsigned char)0xFE),
+ 0xF, 0xF, 0xF, 0xF, 0xF, 0xF, 0xF, 0xF, 0xF, 0xF, 0xF, 0xF, 0xF, 0xF, 0xF, 0xF));
+TEST_CONSTEXPR(match_v16si(
+ _mm512_ternarylogic_epi32(
+ ((__m512i)((__v16si){0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9})),
+ ((__m512i)((__v16si){0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4})),
+ ((__m512i)((__v16si){0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2})),
+ (unsigned char)0x80),
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0));
__m512i test_mm512_mask_ternarylogic_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
// CHECK-LABEL: test_mm512_mask_ternarylogic_epi32
@@ -6213,6 +6234,30 @@ __m512i test_mm512_mask_ternarylogic_epi32(__m512i __A, __mmask16 __U, __m512i _
// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
return _mm512_mask_ternarylogic_epi32(__A, __U, __B, __C, _MM_TERNLOG_B);
}
+TEST_CONSTEXPR(match_v16si(
+ _mm512_mask_ternarylogic_epi32(
+ ((__m512i)((__v16si){-0x1, 0x0, -0x1, 0x0, -0x1, 0x0, -0x1, 0x0, -0x1, 0x0, -0x1, 0x0, -0x1, 0x0, -0x1, 0x0})),
+ (__mmask16)0x3333,
+ ((__m512i)((__v16si){0xB, 0xB, 0xB, 0xB, 0xB, 0xB, 0xB, 0xB, 0xB, 0xB, 0xB, 0xB, 0xB, 0xB, 0xB, 0xB})),
+ ((__m512i)((__v16si){0xC, 0xC, 0xC, 0xC, 0xC, 0xC, 0xC, 0xC, 0xC, 0xC, 0xC, 0xC, 0xC, 0xC, 0xC, 0xC})),
+ (unsigned char)0xCA),
+ 0xB, 0xC, -0x1, 0x0, 0xB, 0xC, -0x1, 0x0, 0xB, 0xC, -0x1, 0x0, 0xB, 0xC, -0x1, 0x0));
+TEST_CONSTEXPR(match_v16si(
+ _mm512_mask_ternarylogic_epi32(
+ ((__m512i)((__v16si){0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9})),
+ (__mmask16)0xCCCC,
+ ((__m512i)((__v16si){0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4})),
+ ((__m512i)((__v16si){0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2})),
+ (unsigned char)0xFE),
+ 0x9, 0x9, 0xF, 0xF, 0x9, 0x9, 0xF, 0xF, 0x9, 0x9, 0xF, 0xF, 0x9, 0x9, 0xF, 0xF));
+TEST_CONSTEXPR(match_v16si(
+ _mm512_mask_ternarylogic_epi32(
+ ((__m512i)((__v16si){0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9})),
+ (__mmask16)0x5555,
+ ((__m512i)((__v16si){0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4})),
+ ((__m512i)((__v16si){0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2})),
+ (unsigned char)0x80),
+ 0x0, 0x9, 0x0, 0x9, 0x0, 0x9, 0x0, 0x9, 0x0, 0x9, 0x0, 0x9, 0x0, 0x9, 0x0, 0x9));
__m512i test_mm512_maskz_ternarylogic_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i __C) {
// CHECK-LABEL: test_mm512_maskz_ternarylogic_epi32
@@ -6220,12 +6265,57 @@ __m512i test_mm512_maskz_ternarylogic_epi32(__mmask16 __U, __m512i __A, __m512i
// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> zeroinitializer
return _mm512_maskz_ternarylogic_epi32(__U, __A, __B, __C, _MM_TERNLOG_C);
}
+TEST_CONSTEXPR(match_v16si(
+ _mm512_maskz_ternarylogic_epi32(
+ (__mmask16)0x3333,
+ ((__m512i)((__v16si){-0x1, 0x0, -0x1, 0x0, -0x1, 0x0, -0x1, 0x0, -0x1, 0x0, -0x1, 0x0, -0x1, 0x0, -0x1, 0x0})),
+ ((__m512i)((__v16si){0xB, 0xB, 0xB, 0xB, 0xB, 0xB, 0xB, 0xB, 0xB, 0xB, 0xB, 0xB, 0xB, 0xB, 0xB, 0xB})),
+ ((__m512i)((__v16si){0xC, 0xC, 0xC, 0xC, 0xC, 0xC, 0xC, 0xC, 0xC, 0xC, 0xC, 0xC, 0xC, 0xC, 0xC, 0xC})),
+ (unsigned char)0xCA),
+ 0xB, 0xC, 0x0, 0x0, 0xB, 0xC, 0x0, 0x0, 0xB, 0xC, 0x0, 0x0, 0xB, 0xC, 0x0, 0x0));
+TEST_CONSTEXPR(match_v16si(
+ _mm512_maskz_ternarylogic_epi32(
+ (__mmask16)0xCCCC,
+ ((__m512i)((__v16si){0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9})),
+ ((__m512i)((__v16si){0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4})),
+ ((__m512i)((__v16si){0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2})),
+ (unsigned char)0xFE),
+ 0x0, 0x0, 0xF, 0xF, 0x0, 0x0, 0xF, 0xF, 0x0, 0x0, 0xF, 0xF, 0x0, 0x0, 0xF, 0xF));
+TEST_CONSTEXPR(match_v16si(
+ _mm512_maskz_ternarylogic_epi32(
+ (__mmask16)0x5555,
+ ((__m512i)((__v16si){0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9})),
+ ((__m512i)((__v16si){0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4})),
+ ((__m512i)((__v16si){0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2})),
+ (unsigned char)0x80),
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0));
__m512i test_mm512_ternarylogic_epi64(__m512i __A, __m512i __B, __m512i __C) {
// CHECK-LABEL: test_mm512_ternarylogic_epi64
// CHECK: @llvm.x86.avx512.pternlog.q.512({{.*}}, i32 192)
return _mm512_ternarylogic_epi64(__A, __B, __C, _MM_TERNLOG_A & _MM_TERNLOG_B);
}
+TEST_CONSTEXPR(match_v8di(
+ _mm512_ternarylogic_epi64(
+ ((__m512i)((__v8di){-0x1, 0x0, -0x1, 0x0, -0x1, 0x0, -0x1, 0x0})),
+ ((__m512i)((__v8di){0xB, 0xB, 0xB, 0xB, 0xB, 0xB, 0xB, 0xB})),
+ ((__m512i)((__v8di){0xC, 0xC, 0xC, 0xC, 0xC, 0xC, 0xC, 0xC})),
+ (unsigned char)0xCA),
+ 0xB, 0xC, 0xB, 0xC, 0xB, 0xC, 0xB, 0xC));
+TEST_CONSTEXPR(match_v8di(
+ _mm512_ternarylogic_epi64(
+ ((__m512i)((__v8di){0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9})),
+ ((__m512i)((__v8di){0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4})),
+ ((__m512i)((__v8di){0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2})),
+ (unsigned char)0xFE),
+ 0xF, 0xF, 0xF, 0xF, 0xF, 0xF, 0xF, 0xF));
+TEST_CONSTEXPR(match_v8di(
+ _mm512_ternarylogic_epi64(
+ ((__m512i)((__v8di){0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9})),
+ ((__m512i)((__v8di){0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4})),
+ ((__m512i)((__v8di){0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2})),
+ (unsigned char)0x80),
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0));
__m512i test_mm512_mask_ternarylogic_epi64(__m512i __A, __mmask8 __U, __m512i __B, __m512i __C) {
// CHECK-LABEL: test_mm512_mask_ternarylogic_epi64
@@ -6233,6 +6323,30 @@ __m512i test_mm512_mask_ternarylogic_epi64(__m512i __A, __mmask8 __U, __m512i __
// CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
return _mm512_mask_ternarylogic_epi64(__A, __U, __B, __C, _MM_TERNLOG_B | _MM_TERNLOG_C);
}
+TEST_CONSTEXPR(match_v8di(
+ _mm512_mask_ternarylogic_epi64(
+ ((__m512i)((__v8di){-0x1, 0x0, -0x1, 0x0, -0x1, 0x0, -0x1, 0x0})),
+ (__mmask8)0x33,
+ ((__m512i)((__v8di){0xB, 0xB, 0xB, 0xB, 0xB, 0xB, 0xB, 0xB})),
+ ((__m512i)((__v8di){0xC, 0xC, 0xC, 0xC, 0xC, 0xC, 0xC, 0xC})),
+ (unsigned char)0xCA),
+ 0xB, 0xC, -0x1, 0x0, 0xB, 0xC, -0x1, 0x0));
+TEST_CONSTEXPR(match_v8di(
+ _mm512_mask_ternarylogic_epi64(
+ ((__m512i)((__v8di){0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9})),
+ (__mmask8)0xCC,
+ ((__m512i)((__v8di){0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4})),
+ ((__m512i)((__v8di){0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2})),
+ (unsigned char)0xFE),
+ 0x9, 0x9, 0xF, 0xF, 0x9, 0x9, 0xF, 0xF));
+TEST_CONSTEXPR(match_v8di(
+ _mm512_mask_ternarylogic_epi64(
+ ((__m512i)((__v8di){0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9})),
+ (__mmask8)0x55,
+ ((__m512i)((__v8di){0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4})),
+ ((__m512i)((__v8di){0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2})),
+ (unsigned char)0x80),
+ 0x0, 0x9, 0x0, 0x9, 0x0, 0x9, 0x0, 0x9));
__m512i test_mm512_maskz_ternarylogic_epi64(__mmask8 __U, __m512i __A, __m512i __B, __m512i __C) {
// CHECK-LABEL: test_mm512_maskz_ternarylogic_epi64
@@ -6240,6 +6354,30 @@ __m512i test_mm512_maskz_ternarylogic_epi64(__mmask8 __U, __m512i __A, __m512i _
// CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> zeroinitializer
return _mm512_maskz_ternarylogic_epi64(__U, __A, __B, __C, ~_MM_TERNLOG_A | (_MM_TERNLOG_B ^ _MM_TERNLOG_C));
}
+TEST_CONSTEXPR(match_v8di(
+ _mm512_maskz_ternarylogic_epi64(
+ (__mmask8)0x33,
+ ((__m512i)((__v8di){-0x1, 0x0, -0x1, 0x0, -0x...
[truncated]
|
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
|
@phoebewang Sorry, accidentally requested your review, didn't mean to. |
RKSimon
left a comment
There was a problem hiding this comment.
please can you merge against trunk
|
I couldn't get clang-format to change any files, my recent push was just to rebase. |
|
Are you still stuck with a problem? I can have a look. |
|
@tbaederr I'm still working on it. Thanks for checking in! |
|
As an update, I made better test cases with varying ternary operations and values, but I was not able to debug the handling in the new evaluator. There are three tests which pass from the old evaluator but does not pass from the new evaluator, and I have commented these failing tests. What the failing tests have in common is that they hit the pternlogq512 path (512-bit, 64-bit lanes), although the new evaluator passes other tests that hit this path as well. What I have tried was reviewing my logic, and trying to replicate the code exactly as I had written it in the old evaluator (since the old evaluator is passing everything). @tbaederr If anyone could have a look, that would be great 😅 |
|
Can you rebase this first? The patch doesn't apply on |
Okay, will do. |
|
@tbaederr Is the branch fine now? |
|
Yes, thanks. |
|
Should work once #161506 is merged. |
❤️ |
51d6412 to
ae6265f
Compare
|
Better test coverage with help of LLM. |
Fix #157698
Add handling for
__builtin_ia32_pternlog[d/q][128/256/512]_mask[z]intrinsics toVectorExprEvaluator::VisitCallExprandInterpBuiltin.cppwith the corresponding test coverage: