From af22395f4e960cec52436e61c6ecadda753271d1 Mon Sep 17 00:00:00 2001 From: Khushal Modi Date: Fri, 10 May 2024 20:18:35 -0700 Subject: [PATCH] Lower AVX10v1 hwintrinsic in lowering and gentree.cpp for simdSize 32/16 --- src/coreclr/jit/compiler.h | 3 +- src/coreclr/jit/gentree.cpp | 568 ++++++++++++++---- src/coreclr/jit/hwintrinsiccodegenxarch.cpp | 5 + src/coreclr/jit/hwintrinsiclistxarch.h | 170 +++--- src/coreclr/jit/hwintrinsicxarch.cpp | 50 +- src/coreclr/jit/importercalls.cpp | 177 +++++- src/coreclr/jit/importervectorization.cpp | 6 +- src/coreclr/jit/lowerxarch.cpp | 100 ++- src/coreclr/jit/morph.cpp | 1 + src/coreclr/jit/simdashwintrinsic.cpp | 56 +- .../GenerateHWIntrinsicTests_X86.cs | 24 +- 11 files changed, 936 insertions(+), 224 deletions(-) diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 88b63c49ff8a43..155bbccd8b888f 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -9502,7 +9502,8 @@ class Compiler // bool canUseEvexEncoding() const { - return compOpportunisticallyDependsOn(InstructionSet_AVX512F); + return (compOpportunisticallyDependsOn(InstructionSet_AVX512F) || + compOpportunisticallyDependsOn(InstructionSet_AVX10v1)); } private: diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 5e2e8491f56567..7a40589975dcf2 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -19726,6 +19726,8 @@ bool GenTree::isContainableHWIntrinsic() const case NI_AVX512BW_VL_ConvertToVector128SByteWithSaturation: case NI_AVX512DQ_ExtractVector128: case NI_AVX512DQ_ExtractVector256: + case NI_AVX10v1_ConvertToVector128Byte: + case NI_AVX10v1_ConvertToVector128ByteWithSaturation: case NI_AVX10v1_ConvertToVector128Int16: case NI_AVX10v1_ConvertToVector128Int16WithSaturation: case NI_AVX10v1_ConvertToVector128Int32: @@ -19818,6 +19820,7 @@ bool GenTree::isRMWHWIntrinsic(Compiler* comp) case NI_AVX512F_FixupScalar: case NI_AVX512F_VL_Fixup: case NI_AVX10v1_Fixup: + case NI_AVX10v1_FixupScalar: { // We are actually only RMW in the case where the lookup table // has any value that could result in `op1` being picked. So @@ -19845,7 +19848,7 @@ bool GenTree::isRMWHWIntrinsic(Compiler* comp) uint32_t count = simdSize / sizeof(uint32_t); uint32_t incSize = (simdBaseType == TYP_FLOAT) ? 
1 : 2; - if (intrinsicId == NI_AVX512F_FixupScalar) + if (intrinsicId == NI_AVX512F_FixupScalar || intrinsicId == NI_AVX10v1_FixupScalar) { // Upper elements come from op2 count = 1; @@ -20090,6 +20093,10 @@ GenTree* Compiler::gtNewSimdAbsNode(var_types type, GenTree* op1, CorInfoType si assert(compIsaSupportedDebugOnly(InstructionSet_AVX512F)); intrinsic = NI_AVX512F_Abs; } + else if (compOpportunisticallyDependsOn(InstructionSet_AVX10v1)) + { + intrinsic = NI_AVX10v1_Abs; + } else if (compOpportunisticallyDependsOn(InstructionSet_AVX512F_VL)) { intrinsic = NI_AVX512F_VL_Abs; @@ -20441,8 +20448,16 @@ GenTree* Compiler::gtNewSimdBinOpNode( if (varTypeIsLong(simdBaseType) || (simdBaseType == TYP_DOUBLE)) { assert(varTypeIsSigned(simdBaseType)); - assert(compIsaSupportedDebugOnly(InstructionSet_AVX512F_VL)); - intrinsic = NI_AVX512F_VL_ShiftRightArithmetic; + assert(compIsaSupportedDebugOnly(InstructionSet_AVX512F_VL) || + compIsaSupportedDebugOnly(InstructionSet_AVX10v1)); + if (compOpportunisticallyDependsOn(InstructionSet_AVX10v1)) + { + intrinsic = NI_AVX10v1_ShiftRightArithmetic; + } + else + { + intrinsic = NI_AVX512F_VL_ShiftRightArithmetic; + } } else { @@ -20503,8 +20518,14 @@ GenTree* Compiler::gtNewSimdBinOpNode( if (varTypeIsLong(simdBaseType) || (simdBaseType == TYP_DOUBLE)) { assert(varTypeIsSigned(simdBaseType)); - assert(compIsaSupportedDebugOnly(InstructionSet_AVX512F_VL)); - intrinsic = NI_AVX512F_VL_ShiftRightArithmetic; + if (compOpportunisticallyDependsOn(InstructionSet_AVX10v1)) + { + intrinsic = NI_AVX10v1_ShiftRightArithmetic; + } + else + { + intrinsic = NI_AVX512F_VL_ShiftRightArithmetic; + } } else { @@ -20618,7 +20639,45 @@ GenTree* Compiler::gtNewSimdBinOpNode( } else if (simdSize == 16 && compOpportunisticallyDependsOn(InstructionSet_AVX2)) { - if (IsBaselineVector512IsaSupportedOpportunistically()) + if (compOpportunisticallyDependsOn(InstructionSet_AVX10v1)) + { + // Input is SIMD16 [U]Byte and AVX512BW_VL is supported: + // - Widen inputs as SIMD32 [U]Short + // - Multiply widened inputs (SIMD32 [U]Short) as widened product (SIMD32 [U]Short) + // - Narrow widened product (SIMD32 [U]Short) as SIMD16 [U]Byte + widenIntrinsic = NI_AVX2_ConvertToVector256Int16; + + if (simdBaseType == TYP_BYTE) + { + widenedSimdBaseJitType = CORINFO_TYPE_SHORT; + narrowIntrinsic = NI_AVX10v1_ConvertToVector128SByte; + } + else + { + widenedSimdBaseJitType = CORINFO_TYPE_USHORT; + narrowIntrinsic = NI_AVX10v1_ConvertToVector128Byte; + } + + widenedType = TYP_SIMD32; + widenedSimdSize = 32; + + // Vector256 widenedOp1 = Avx2.ConvertToVector256Int16(op1).AsUInt16() + GenTree* widenedOp1 = gtNewSimdHWIntrinsicNode(widenedType, op1, widenIntrinsic, + simdBaseJitType, widenedSimdSize); + + // Vector256 widenedOp2 = Avx2.ConvertToVector256Int16(op2).AsUInt16() + GenTree* widenedOp2 = gtNewSimdHWIntrinsicNode(widenedType, op2, widenIntrinsic, + simdBaseJitType, widenedSimdSize); + + // Vector256 widenedProduct = widenedOp1 * widenedOp2 + GenTree* widenedProduct = gtNewSimdBinOpNode(GT_MUL, widenedType, widenedOp1, widenedOp2, + widenedSimdBaseJitType, widenedSimdSize); + + // Vector128 product = Avx512BW.VL.ConvertToVector128Byte(widenedProduct) + return gtNewSimdHWIntrinsicNode(type, widenedProduct, narrowIntrinsic, + widenedSimdBaseJitType, widenedSimdSize); + } + else if (IsBaselineVector512IsaSupportedOpportunistically()) { // Input is SIMD16 [U]Byte and AVX512BW_VL is supported: // - Widen inputs as SIMD32 [U]Short @@ -20838,9 +20897,14 @@ GenTree* Compiler::gtNewSimdBinOpNode( case 
TYP_ULONG: { assert((simdSize == 16) || (simdSize == 32) || (simdSize == 64)); - assert(compIsaSupportedDebugOnly(InstructionSet_AVX512DQ_VL)); + assert(compIsaSupportedDebugOnly(InstructionSet_AVX512DQ_VL) || + compIsaSupportedDebugOnly(InstructionSet_AVX10v1)); - if (simdSize != 64) + if (simdSize != 64 && compOpportunisticallyDependsOn(InstructionSet_AVX10v1)) + { + intrinsic = NI_AVX10v1_MultiplyLow; + } + else if (simdSize != 64) { intrinsic = NI_AVX512DQ_VL_MultiplyLow; } @@ -21390,12 +21454,45 @@ GenTree* Compiler::gtNewSimdCvtNode(var_types type, #if defined(TARGET_XARCH) assert(IsBaselineVector512IsaSupportedDebugOnly() || + (simdSize != 64 && compIsaSupportedDebugOnly(InstructionSet_AVX10v1)) || ((simdTargetBaseType == TYP_INT) && ((simdSize == 16 && compIsaSupportedDebugOnly(InstructionSet_SSE41)) || (simdSize == 32 && compIsaSupportedDebugOnly(InstructionSet_AVX))))); GenTree* fixupVal; - if (IsBaselineVector512IsaSupportedOpportunistically()) + if (simdSize != 64 && compOpportunisticallyDependsOn(InstructionSet_AVX10v1)) + { + /*Generate the control table for VFIXUPIMMSD/SS + - For conversion to unsigned + // QNAN: 0b1000: Saturate to Zero + // SNAN: 0b1000: Saturate to Zero + // ZERO: 0b0000 + // +ONE: 0b0000 + // -INF: 0b1000: Saturate to Zero + // +INF: 0b0000 + // -VAL: 0b1000: Saturate to Zero + // +VAL: 0b0000 + - For conversion to signed + // QNAN: 0b1000: Saturate to Zero + // SNAN: 0b1000: Saturate to Zero + // ZERO: 0b0000 + // +ONE: 0b0000 + // -INF: 0b0000 + // +INF: 0b0000 + // -VAL: 0b0000 + // +VAL: 0b0000 + */ + int32_t iconVal = varTypeIsUnsigned(simdTargetBaseType) ? 0x08080088 : 0x00000088; + GenTree* tblCon = gtNewSimdCreateBroadcastNode(type, gtNewIconNode(iconVal), simdTargetBaseJitType, simdSize); + + // We need op1Clone to run fixup + GenTree* op1Clone = fgMakeMultiUse(&op1); + + // run vfixupimmsd base on table and no flags reporting + fixupVal = gtNewSimdHWIntrinsicNode(type, op1, op1Clone, tblCon, gtNewIconNode(0), NI_AVX10v1_Fixup, + simdSourceBaseJitType, simdSize); + } + else if (IsBaselineVector512IsaSupportedOpportunistically()) { /*Generate the control table for VFIXUPIMMSD/SS - For conversion to unsigned @@ -21506,145 +21603,265 @@ GenTree* Compiler::gtNewSimdCvtNativeNode(var_types type, #if defined(TARGET_XARCH) assert(IsBaselineVector512IsaSupportedDebugOnly() || + (simdSize != 64 && compIsaSupportedDebugOnly(InstructionSet_AVX10v1)) || ((simdTargetBaseType == TYP_INT) && ((simdSize == 16) || (simdSize == 32 && compIsaSupportedDebugOnly(InstructionSet_AVX))))); - switch (simdSourceBaseJitType) + if (compOpportunisticallyDependsOn(InstructionSet_AVX10v1)) { - case CORINFO_TYPE_FLOAT: + switch (simdSourceBaseJitType) { - switch (simdTargetBaseJitType) + case CORINFO_TYPE_FLOAT: { - case CORINFO_TYPE_INT: + switch (simdTargetBaseJitType) { - switch (simdSize) + case CORINFO_TYPE_INT: { - case 64: + switch (simdSize) { - hwIntrinsicID = NI_AVX512F_ConvertToVector512Int32WithTruncation; - break; - } + case 32: + { + hwIntrinsicID = NI_AVX_ConvertToVector256Int32WithTruncation; + break; + } - case 32: - { - hwIntrinsicID = NI_AVX_ConvertToVector256Int32WithTruncation; - break; + case 16: + { + hwIntrinsicID = NI_SSE2_ConvertToVector128Int32WithTruncation; + break; + } + + default: + unreached(); } + break; + } - case 16: + case CORINFO_TYPE_UINT: + { + switch (simdSize) { - hwIntrinsicID = NI_SSE2_ConvertToVector128Int32WithTruncation; - break; - } + case 32: + { + hwIntrinsicID = NI_AVX10v1_ConvertToVector256UInt32WithTruncation; + break; + } - 
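// ---- Illustrative aside, not part of the patch ----
// A minimal sketch of the widen-multiply-narrow strategy that the Vector128<byte> multiply path
// added to gtNewSimdBinOpNode earlier in this patch relies on, shown element-wise on plain arrays
// (the function name and array form are hypothetical; the JIT builds SIMD nodes instead). Keeping
// only the low byte of each widened product is what the ConvertToVector128Byte/SByte (vpmovwb)
// narrowing step performs per lane.
#include <cstddef>
#include <cstdint>

void MultiplyBytesLow(const uint8_t* op1, const uint8_t* op2, uint8_t* result, size_t count)
{
    for (size_t i = 0; i < count; i++)
    {
        // widen both inputs to 16 bits and multiply in the wider type
        uint16_t widenedProduct = static_cast<uint16_t>(static_cast<uint16_t>(op1[i]) * op2[i]);

        // narrow back to 8 bits, keeping only the low byte of the product
        result[i] = static_cast<uint8_t>(widenedProduct);
    }
}
// ---- end aside ----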
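// ---- Illustrative aside, not part of the patch ----
// How the vfixupimm control dword used by gtNewSimdCvtNode above is assembled: each input
// classification (in the order listed in the comment: QNAN, SNAN, ZERO, +ONE, -INF, +INF,
// -VAL, +VAL) selects one 4-bit action, and 0b1000 means "saturate to zero". The helper name
// below is hypothetical; the packed constants match the table values in the patch.
#include <cstdint>

static constexpr uint32_t PackFixupControl(uint32_t qnan, uint32_t snan, uint32_t zero, uint32_t posOne,
                                           uint32_t negInf, uint32_t posInf, uint32_t negVal, uint32_t posVal)
{
    return (qnan << 0) | (snan << 4) | (zero << 8) | (posOne << 12) | (negInf << 16) | (posInf << 20) |
           (negVal << 24) | (posVal << 28);
}

static_assert(PackFixupControl(8, 8, 0, 0, 8, 0, 8, 0) == 0x08080088,
              "conversion to unsigned saturates NaN, -INF and -VAL to zero");
static_assert(PackFixupControl(8, 8, 0, 0, 0, 0, 0, 0) == 0x00000088,
              "conversion to signed only saturates NaN to zero");
// ---- end aside ----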
default: - unreached(); + case 16: + { + hwIntrinsicID = NI_AVX10v1_ConvertToVector128UInt32WithTruncation; + break; + } + + default: + unreached(); + } + break; } - break; + + default: + unreached(); } + break; + } - case CORINFO_TYPE_UINT: + case CORINFO_TYPE_DOUBLE: + { + switch (simdTargetBaseJitType) { - switch (simdSize) + case CORINFO_TYPE_LONG: { - case 64: + switch (simdSize) { - hwIntrinsicID = NI_AVX512F_ConvertToVector512UInt32WithTruncation; - break; - } + case 32: + { + hwIntrinsicID = NI_AVX10v1_ConvertToVector256Int64WithTruncation; + break; + } - case 32: - { - hwIntrinsicID = NI_AVX512F_VL_ConvertToVector256UInt32WithTruncation; - break; + case 16: + { + hwIntrinsicID = NI_AVX10v1_ConvertToVector128Int64WithTruncation; + break; + } + + default: + unreached(); } + break; + } - case 16: + case CORINFO_TYPE_ULONG: + { + switch (simdSize) { - hwIntrinsicID = NI_AVX512F_VL_ConvertToVector128UInt32WithTruncation; - break; - } + case 32: + { + hwIntrinsicID = NI_AVX10v1_ConvertToVector256UInt64WithTruncation; + break; + } - default: - unreached(); + case 16: + { + hwIntrinsicID = NI_AVX10v1_ConvertToVector128UInt64WithTruncation; + break; + } + + default: + unreached(); + } + break; } - break; - } - default: - unreached(); + default: + unreached(); + } + break; } - break; - } - case CORINFO_TYPE_DOUBLE: + default: + unreached(); + } + } + else + { + switch (simdSourceBaseJitType) { - switch (simdTargetBaseJitType) + case CORINFO_TYPE_FLOAT: { - case CORINFO_TYPE_LONG: + switch (simdTargetBaseJitType) { - switch (simdSize) + case CORINFO_TYPE_INT: { - case 64: + switch (simdSize) { - hwIntrinsicID = NI_AVX512DQ_ConvertToVector512Int64WithTruncation; - break; - } + case 64: + { + hwIntrinsicID = NI_AVX512F_ConvertToVector512Int32WithTruncation; + break; + } - case 32: - { - hwIntrinsicID = NI_AVX512DQ_VL_ConvertToVector256Int64WithTruncation; - break; + case 32: + { + hwIntrinsicID = NI_AVX_ConvertToVector256Int32WithTruncation; + break; + } + + case 16: + { + hwIntrinsicID = NI_SSE2_ConvertToVector128Int32WithTruncation; + break; + } + + default: + unreached(); } + break; + } - case 16: + case CORINFO_TYPE_UINT: + { + switch (simdSize) { - hwIntrinsicID = NI_AVX512DQ_VL_ConvertToVector128Int64WithTruncation; - break; - } + case 64: + { + hwIntrinsicID = NI_AVX512F_ConvertToVector512UInt32WithTruncation; + break; + } - default: - unreached(); + case 32: + { + hwIntrinsicID = NI_AVX512F_VL_ConvertToVector256UInt32WithTruncation; + break; + } + + case 16: + { + hwIntrinsicID = NI_AVX512F_VL_ConvertToVector128UInt32WithTruncation; + break; + } + + default: + unreached(); + } + break; } - break; + + default: + unreached(); } + break; + } - case CORINFO_TYPE_ULONG: + case CORINFO_TYPE_DOUBLE: + { + switch (simdTargetBaseJitType) { - switch (simdSize) + case CORINFO_TYPE_LONG: { - case 64: + switch (simdSize) { - hwIntrinsicID = NI_AVX512DQ_ConvertToVector512UInt64WithTruncation; - break; - } + case 64: + { + hwIntrinsicID = NI_AVX512DQ_ConvertToVector512Int64WithTruncation; + break; + } - case 32: - { - hwIntrinsicID = NI_AVX512DQ_VL_ConvertToVector256UInt64WithTruncation; - break; + case 32: + { + hwIntrinsicID = NI_AVX512DQ_VL_ConvertToVector256Int64WithTruncation; + break; + } + + case 16: + { + hwIntrinsicID = NI_AVX512DQ_VL_ConvertToVector128Int64WithTruncation; + break; + } + + default: + unreached(); } + break; + } - case 16: + case CORINFO_TYPE_ULONG: + { + switch (simdSize) { - hwIntrinsicID = NI_AVX512DQ_VL_ConvertToVector128UInt64WithTruncation; - break; - } + case 64: + { + 
hwIntrinsicID = NI_AVX512DQ_ConvertToVector512UInt64WithTruncation; + break; + } - default: - unreached(); + case 32: + { + hwIntrinsicID = NI_AVX512DQ_VL_ConvertToVector256UInt64WithTruncation; + break; + } + + case 16: + { + hwIntrinsicID = NI_AVX512DQ_VL_ConvertToVector128UInt64WithTruncation; + break; + } + + default: + unreached(); + } + break; } - break; - } - default: - unreached(); + default: + unreached(); + } + break; } - break; - } - default: - unreached(); + default: + unreached(); + } } #elif defined(TARGET_ARM64) assert((simdSize == 8) || (simdSize == 16)); @@ -23861,6 +24078,10 @@ GenTree* Compiler::gtNewSimdMaxNode( { intrinsic = NI_AVX2_Max; } + else if (compOpportunisticallyDependsOn(InstructionSet_AVX10v1)) + { + intrinsic = NI_AVX10v1_Max; + } else if (compOpportunisticallyDependsOn(InstructionSet_AVX512F_VL)) { intrinsic = NI_AVX512F_VL_Max; @@ -23974,7 +24195,11 @@ GenTree* Compiler::gtNewSimdMaxNode( case TYP_LONG: case TYP_ULONG: { - if (compOpportunisticallyDependsOn(InstructionSet_AVX512F_VL)) + if (compOpportunisticallyDependsOn(InstructionSet_AVX10v1)) + { + intrinsic = NI_AVX10v1_Max; + } + else if (compOpportunisticallyDependsOn(InstructionSet_AVX512F_VL)) { intrinsic = NI_AVX512F_VL_Max; } @@ -24068,6 +24293,10 @@ GenTree* Compiler::gtNewSimdMinNode( { intrinsic = NI_AVX2_Min; } + else if (compOpportunisticallyDependsOn(InstructionSet_AVX10v1)) + { + intrinsic = NI_AVX10v1_Min; + } else if (compOpportunisticallyDependsOn(InstructionSet_AVX512F_VL)) { intrinsic = NI_AVX512F_VL_Min; @@ -24177,7 +24406,11 @@ GenTree* Compiler::gtNewSimdMinNode( case TYP_LONG: case TYP_ULONG: { - if (compOpportunisticallyDependsOn(InstructionSet_AVX512F_VL)) + if (compOpportunisticallyDependsOn(InstructionSet_AVX10v1)) + { + intrinsic = NI_AVX10v1_Min; + } + else if (compOpportunisticallyDependsOn(InstructionSet_AVX512F_VL)) { intrinsic = NI_AVX512F_VL_Min; } @@ -24258,8 +24491,96 @@ GenTree* Compiler::gtNewSimdNarrowNode( #if defined(TARGET_XARCH) GenTree* tmp3; GenTree* tmp4; + if ((simdSize != 64) && compOpportunisticallyDependsOn(InstructionSet_AVX10v1)) + { + // This is the same in principle to the other comments below, however due to + // code formatting, its too long to reasonably display here. 
+ + assert((simdSize == 16) || (simdSize == 32)); + var_types tmpSimdType = TYP_SIMD16; - if (IsBaselineVector512IsaSupportedOpportunistically()) + NamedIntrinsic intrinsicId; + CorInfoType opBaseJitType; + + switch (simdBaseType) + { + case TYP_BYTE: + { + intrinsicId = NI_AVX10v1_ConvertToVector128SByte; + opBaseJitType = CORINFO_TYPE_SHORT; + break; + } + + case TYP_UBYTE: + { + intrinsicId = NI_AVX10v1_ConvertToVector128Byte; + opBaseJitType = CORINFO_TYPE_USHORT; + break; + } + + case TYP_SHORT: + { + intrinsicId = NI_AVX10v1_ConvertToVector128Int16; + opBaseJitType = CORINFO_TYPE_INT; + break; + } + + case TYP_USHORT: + { + intrinsicId = NI_AVX10v1_ConvertToVector128UInt16; + opBaseJitType = CORINFO_TYPE_UINT; + break; + } + + case TYP_INT: + { + intrinsicId = NI_AVX10v1_ConvertToVector128Int32; + opBaseJitType = CORINFO_TYPE_LONG; + break; + } + + case TYP_UINT: + { + intrinsicId = NI_AVX10v1_ConvertToVector128UInt32; + opBaseJitType = CORINFO_TYPE_ULONG; + break; + } + + case TYP_FLOAT: + { + if (simdSize == 32) + { + intrinsicId = NI_AVX_ConvertToVector128Single; + } + else + { + intrinsicId = NI_SSE2_ConvertToVector128Single; + } + + opBaseJitType = CORINFO_TYPE_DOUBLE; + break; + } + + default: + { + unreached(); + } + } + + tmp1 = gtNewSimdHWIntrinsicNode(tmpSimdType, op1, intrinsicId, opBaseJitType, simdSize); + tmp2 = gtNewSimdHWIntrinsicNode(tmpSimdType, op2, intrinsicId, opBaseJitType, simdSize); + + if (simdSize == 16) + { + return gtNewSimdHWIntrinsicNode(type, tmp1, tmp2, NI_SSE_MoveLowToHigh, CORINFO_TYPE_FLOAT, simdSize); + } + + intrinsicId = NI_Vector128_ToVector256Unsafe; + + tmp1 = gtNewSimdHWIntrinsicNode(type, tmp1, intrinsicId, simdBaseJitType, simdSize / 2); + return gtNewSimdWithUpperNode(type, tmp1, tmp2, simdBaseJitType, simdSize); + } + else if (IsBaselineVector512IsaSupportedOpportunistically()) { // This is the same in principle to the other comments below, however due to // code formatting, its too long to reasonably display here. 
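// ---- Illustrative aside, not part of the patch ----
// Element-wise view of the AVX10v1 gtNewSimdNarrowNode path above (names hypothetical): each
// source vector is truncated to the half-width element type (the vpmovdw-style ConvertToVector128*
// intrinsics), then op1 supplies the lower half of the result and op2 the upper half
// (MoveLowToHigh for SIMD16, ToVector256Unsafe + WithUpper for SIMD32).
#include <cstddef>
#include <cstdint>

void NarrowInt32ToInt16(const int32_t* op1, const int32_t* op2, int16_t* result, size_t perInput)
{
    for (size_t i = 0; i < perInput; i++)
    {
        result[i]            = static_cast<int16_t>(op1[i]); // truncating narrow of op1 -> lower half
        result[perInput + i] = static_cast<int16_t>(op2[i]); // truncating narrow of op2 -> upper half
    }
}
// ---- end aside ----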
@@ -24866,8 +25187,9 @@ GenTree* Compiler::gtNewSimdShuffleNode( { assert(compIsaSupportedDebugOnly(InstructionSet_AVX2)); - if ((varTypeIsByte(simdBaseType) && !compOpportunisticallyDependsOn(InstructionSet_AVX512VBMI_VL)) || - (varTypeIsShort(simdBaseType) && !compOpportunisticallyDependsOn(InstructionSet_AVX512BW_VL))) + if (((varTypeIsByte(simdBaseType) && !compOpportunisticallyDependsOn(InstructionSet_AVX512VBMI_VL)) || + (varTypeIsShort(simdBaseType) && !compOpportunisticallyDependsOn(InstructionSet_AVX512BW_VL))) && + (!compOpportunisticallyDependsOn(InstructionSet_AVX10v1))) { if (crossLane) { @@ -24920,6 +25242,34 @@ GenTree* Compiler::gtNewSimdShuffleNode( // swap the operands to match the encoding requirements retNode = gtNewSimdHWIntrinsicNode(type, op2, op1, NI_AVX2_PermuteVar8x32, simdBaseJitType, simdSize); } + else if (((elementSize == 2) || (elementSize == 1)) && compOpportunisticallyDependsOn(InstructionSet_AVX10v1)) + { + if (elementSize == 2) + { + assert(compIsaSupportedDebugOnly(InstructionSet_AVX10v1)); + for (uint32_t i = 0; i < elementCount; i++) + { + vecCns.u16[i] = (uint8_t)(vecCns.u8[i * elementSize] / elementSize); + } + + op2 = gtNewVconNode(type); + op2->AsVecCon()->gtSimdVal = vecCns; + + // swap the operands to match the encoding requirements + retNode = + gtNewSimdHWIntrinsicNode(type, op2, op1, NI_AVX10v1_PermuteVar16x16, simdBaseJitType, simdSize); + } + else // elementSize == 1 + { + assert(compIsaSupportedDebugOnly(InstructionSet_AVX512VBMI_VL)); + op2 = gtNewVconNode(type); + op2->AsVecCon()->gtSimdVal = vecCns; + + // swap the operands to match the encoding requirements + retNode = + gtNewSimdHWIntrinsicNode(type, op2, op1, NI_AVX10v1_PermuteVar32x8, simdBaseJitType, simdSize); + } + } else if (elementSize == 2) { assert(compIsaSupportedDebugOnly(InstructionSet_AVX512BW_VL)); @@ -25535,7 +25885,8 @@ GenTree* Compiler::gtNewSimdTernaryLogicNode(var_types type, CorInfoType simdBaseJitType, unsigned simdSize) { - assert(IsBaselineVector512IsaSupportedDebugOnly()); + assert(IsBaselineVector512IsaSupportedDebugOnly() || + ((simdSize != 64) && compIsaSupportedDebugOnly(InstructionSet_AVX10v1))); assert(varTypeIsSIMD(type)); assert(getSIMDTypeForSize(simdSize) == type); @@ -25561,6 +25912,11 @@ GenTree* Compiler::gtNewSimdTernaryLogicNode(var_types type, { intrinsic = NI_AVX512F_TernaryLogic; } + else if (compOpportunisticallyDependsOn(InstructionSet_AVX10v1)) + { + assert((simdSize == 16) || (simdSize == 32)); + intrinsic = NI_AVX10v1_TernaryLogic; + } else { assert((simdSize == 16) || (simdSize == 32)); @@ -25707,8 +26063,11 @@ GenTree* Compiler::gtNewSimdUnOpNode( { assert(compIsaSupportedDebugOnly(InstructionSet_AVX)); } - - if ((genTypeSize(simdBaseType) >= 4) && compOpportunisticallyDependsOn(InstructionSet_AVX512F_VL)) + if ((genTypeSize(simdBaseType) >= 4) && compOpportunisticallyDependsOn(InstructionSet_AVX10v1)) + { + intrinsic = NI_AVX10v1_TernaryLogic; + } + else if ((genTypeSize(simdBaseType) >= 4) && compOpportunisticallyDependsOn(InstructionSet_AVX512F_VL)) { intrinsic = NI_AVX512F_VL_TernaryLogic; } @@ -25717,7 +26076,8 @@ GenTree* Compiler::gtNewSimdUnOpNode( if (intrinsic != NI_Illegal) { // AVX512 allows performing `not` without requiring a memory access - assert(compIsaSupportedDebugOnly(InstructionSet_AVX512F_VL)); + assert(compIsaSupportedDebugOnly(InstructionSet_AVX512F_VL) || + compIsaSupportedDebugOnly(InstructionSet_AVX10v1)); op2 = gtNewZeroConNode(type); op3 = gtNewZeroConNode(type); @@ -26936,6 +27296,8 @@ bool 
GenTreeHWIntrinsic::OperIsEmbRoundingEnabled() const case NI_AVX512DQ_ConvertToVector512Double: case NI_AVX512DQ_ConvertToVector512Int64: case NI_AVX512DQ_ConvertToVector512UInt64: + case NI_AVX10v1_ConvertToInt32: + case NI_AVX10v1_ConvertToUInt32: case NI_AVX10v1_V512_ConvertToVector256Single: case NI_AVX10v1_V512_ConvertToVector512Double: case NI_AVX10v1_V512_ConvertToVector512Int64: diff --git a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp index ee39ebbd61598d..7c7305690cac98 100644 --- a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp +++ b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp @@ -1447,6 +1447,8 @@ void CodeGen::genNonTableDrivenHWIntrinsicsJumpTableFallback(GenTreeHWIntrinsic* case NI_AVX512F_ConvertToInt32: case NI_AVX512F_ConvertToUInt32: + case NI_AVX10v1_ConvertToInt32: + case NI_AVX10v1_ConvertToUInt32: #if defined(TARGET_AMD64) case NI_AVX512F_X64_ConvertToInt64: case NI_AVX512F_X64_ConvertToUInt64: @@ -2833,6 +2835,9 @@ void CodeGen::genAvxFamilyIntrinsic(GenTreeHWIntrinsic* node, insOpts instOption case NI_AVX512F_X64_ConvertToInt64: case NI_AVX512F_X64_ConvertToUInt64: case NI_AVX512F_X64_ConvertToUInt64WithTruncation: + case NI_AVX10v1_ConvertToInt32: + case NI_AVX10v1_ConvertToUInt32: + case NI_AVX10v1_ConvertToUInt32WithTruncation: { assert(baseType == TYP_DOUBLE || baseType == TYP_FLOAT); emitAttr attr = emitTypeSize(targetType); diff --git a/src/coreclr/jit/hwintrinsiclistxarch.h b/src/coreclr/jit/hwintrinsiclistxarch.h index a336d284c1679c..4392ec857c4353 100644 --- a/src/coreclr/jit/hwintrinsiclistxarch.h +++ b/src/coreclr/jit/hwintrinsiclistxarch.h @@ -1379,17 +1379,17 @@ HARDWARE_INTRINSIC(AVX512F, XorMask, // {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // AVX10V1 Intrinsics -HARDWARE_INTRINSIC(AVX10v1, Abs, -1, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpabsq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(AVX10v1, Abs, -1, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpabsq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(AVX10v1, AddScalar, 16, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addss, INS_addsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_EmbRoundingCompatible) -HARDWARE_INTRINSIC(AVX10v1, AlignRight32, -1, 3, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_valignd, INS_valignd, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM) -HARDWARE_INTRINSIC(AVX10v1, AlignRight64, -1, 3, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_valignq, INS_valignq, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM) +HARDWARE_INTRINSIC(AVX10v1, AlignRight32, -1, 
3, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_valignd, INS_valignd, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX10v1, AlignRight64, -1, 3, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_valignq, INS_valignq, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(AVX10v1, BroadcastPairScalarToVector128, 16, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcasti32x2, INS_vbroadcasti32x2, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX10v1, BroadcastPairScalarToVector256, 32, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcasti32x2, INS_vbroadcasti32x2, INS_invalid, INS_invalid, INS_vbroadcastf32x2, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(AVX10v1, CompareGreaterThan, -1, 2, true, {INS_invalid, INS_vpcmpub, INS_invalid, INS_vpcmpuw, INS_invalid, INS_vpcmpud, INS_invalid, INS_vpcmpuq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(AVX10v1, CompareGreaterThanOrEqual, -1, 2, true, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(AVX10v1, CompareLessThan, -1, 2, true, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(AVX10v1, CompareLessThanOrEqual, -1, 2, true, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(AVX10v1, CompareNotEqual, -1, 2, true, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_Commutative) +HARDWARE_INTRINSIC(AVX10v1, CompareGreaterThan, -1, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_InvalidNodeId) +HARDWARE_INTRINSIC(AVX10v1, CompareGreaterThanOrEqual, -1, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_InvalidNodeId) +HARDWARE_INTRINSIC(AVX10v1, CompareLessThan, -1, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_InvalidNodeId) +HARDWARE_INTRINSIC(AVX10v1, CompareLessThanOrEqual, -1, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_InvalidNodeId) +HARDWARE_INTRINSIC(AVX10v1, CompareNotEqual, -1, 2, false, 
{INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_InvalidNodeId) HARDWARE_INTRINSIC(AVX10v1, ConvertScalarToVector128Double, 16, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtsi2sd32, INS_vcvtusi2sd32, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromSecondArg|HW_Flag_CopyUpperBits) HARDWARE_INTRINSIC(AVX10v1, ConvertScalarToVector128Single, 16, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtsi2ss32, INS_vcvtusi2ss32, INS_invalid, INS_invalid, INS_invalid, INS_cvtsd2ss}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromSecondArg|HW_Flag_CopyUpperBits|HW_Flag_EmbRoundingCompatible) HARDWARE_INTRINSIC(AVX10v1, ConvertToInt32, 16, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtss2si, INS_cvtsd2si}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen|HW_Flag_EmbRoundingCompatible) @@ -1397,86 +1397,86 @@ HARDWARE_INTRINSIC(AVX10v1, ConvertToUInt32, HARDWARE_INTRINSIC(AVX10v1, ConvertToUInt32WithTruncation, 16, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvttss2usi32, INS_vcvttsd2usi32}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(AVX10v1, ConvertToVector128Byte, -1, 1, true, {INS_invalid, INS_invalid, INS_vpmovwb, INS_vpmovwb, INS_vpmovdb, INS_vpmovdb, INS_vpmovqb, INS_vpmovqb, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(AVX10v1, ConvertToVector128ByteWithSaturation, -1, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_vpmovuswb, INS_invalid, INS_vpmovusdb, INS_invalid, INS_vpmovusqb, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) -HARDWARE_INTRINSIC(AVX10v1, ConvertToVector128Double, 16, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtudq2pd, INS_vcvtqq2pd, INS_vcvtuqq2pd, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(AVX10v1, ConvertToVector128Double, 16, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtudq2pd, INS_vcvtqq2pd, INS_vcvtuqq2pd, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(AVX10v1, ConvertToVector128Int16, -1, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpmovdw, INS_vpmovdw, INS_vpmovqw, INS_vpmovqw, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(AVX10v1, ConvertToVector128Int16WithSaturation, -1, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpmovsdw, INS_invalid, INS_vpmovsqw, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(AVX10v1, ConvertToVector128Int32, -1, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpmovqd, INS_vpmovqd, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(AVX10v1, ConvertToVector128Int32WithSaturation, -1, 1, false, {INS_invalid, 
INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpmovsqd, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) -HARDWARE_INTRINSIC(AVX10v1, ConvertToVector128Int64, 16, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtps2qq, INS_vcvtpd2qq}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg) -HARDWARE_INTRINSIC(AVX10v1, ConvertToVector128Int64WithTruncation, 16, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvttps2qq, INS_vcvttpd2qq}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(AVX10v1, ConvertToVector128Int64, 16, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtps2qq, INS_vcvtpd2qq}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX10v1, ConvertToVector128Int64WithTruncation, 16, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvttps2qq, INS_vcvttpd2qq}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(AVX10v1, ConvertToVector128SByte, -1, 1, true, {INS_invalid, INS_invalid, INS_vpmovwb, INS_vpmovwb, INS_vpmovdb, INS_vpmovdb, INS_vpmovqb, INS_vpmovqb, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(AVX10v1, ConvertToVector128SByteWithSaturation, -1, 1, true, {INS_invalid, INS_invalid, INS_vpmovswb, INS_invalid, INS_vpmovsdb, INS_invalid, INS_vpmovsqb, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) -HARDWARE_INTRINSIC(AVX10v1, ConvertToVector128Single, -1, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtudq2ps, INS_vcvtqq2ps, INS_vcvtuqq2ps, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(AVX10v1, ConvertToVector128Single, -1, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtudq2ps, INS_vcvtqq2ps, INS_vcvtuqq2ps, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(AVX10v1, ConvertToVector128UInt16, -1, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpmovdw, INS_vpmovdw, INS_vpmovqw, INS_vpmovqw, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(AVX10v1, ConvertToVector128UInt16WithSaturation, -1, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpmovusdw, INS_invalid, INS_vpmovusqw, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(AVX10v1, ConvertToVector128UInt32, -1, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpmovqd, INS_vpmovqd, INS_vcvtps2udq, INS_vcvtpd2udq}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(AVX10v1, ConvertToVector128UInt32WithSaturation, -1, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, 
INS_vpmovusqd, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) -HARDWARE_INTRINSIC(AVX10v1, ConvertToVector128UInt32WithTruncation, -1, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvttps2udq, INS_vcvttpd2udq}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg) -HARDWARE_INTRINSIC(AVX10v1, ConvertToVector128UInt64, 16, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtps2uqq, INS_vcvtpd2uqq}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg) -HARDWARE_INTRINSIC(AVX10v1, ConvertToVector128UInt64WithTruncation, 16, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvttps2uqq, INS_vcvttpd2uqq}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg) -HARDWARE_INTRINSIC(AVX10v1, ConvertToVector256Double, 32, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtudq2pd, INS_vcvtqq2pd, INS_vcvtuqq2pd, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg) -HARDWARE_INTRINSIC(AVX10v1, ConvertToVector256Int64, 32, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtps2qq, INS_vcvtpd2qq}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg) -HARDWARE_INTRINSIC(AVX10v1, ConvertToVector256Int64WithTruncation, 32, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvttps2qq, INS_vcvttpd2qq}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg) -HARDWARE_INTRINSIC(AVX10v1, ConvertToVector256Single, 32, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtudq2ps, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg) -HARDWARE_INTRINSIC(AVX10v1, ConvertToVector256UInt32, 32, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtps2udq, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg) -HARDWARE_INTRINSIC(AVX10v1, ConvertToVector256UInt32WithTruncation, 32, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvttps2udq, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg) -HARDWARE_INTRINSIC(AVX10v1, ConvertToVector256UInt64, 32, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtps2uqq, INS_vcvtpd2uqq}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg) -HARDWARE_INTRINSIC(AVX10v1, ConvertToVector256UInt64WithTruncation, 32, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvttps2uqq, INS_vcvttpd2uqq}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg) -HARDWARE_INTRINSIC(AVX10v1, DetectConflicts, -1, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpconflictd, INS_vpconflictd, INS_vpconflictq, INS_vpconflictq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(AVX10v1, ConvertToVector128UInt32WithTruncation, -1, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvttps2udq, INS_vcvttpd2udq}, HW_Category_SimpleSIMD, 
HW_Flag_BaseTypeFromFirstArg|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX10v1, ConvertToVector128UInt64, 16, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtps2uqq, INS_vcvtpd2uqq}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX10v1, ConvertToVector128UInt64WithTruncation, 16, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvttps2uqq, INS_vcvttpd2uqq}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX10v1, ConvertToVector256Double, 32, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtudq2pd, INS_vcvtqq2pd, INS_vcvtuqq2pd, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX10v1, ConvertToVector256Int64, 32, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtps2qq, INS_vcvtpd2qq}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX10v1, ConvertToVector256Int64WithTruncation, 32, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvttps2qq, INS_vcvttpd2qq}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX10v1, ConvertToVector256Single, 32, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtudq2ps, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX10v1, ConvertToVector256UInt32, 32, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtps2udq, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX10v1, ConvertToVector256UInt32WithTruncation, 32, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvttps2udq, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX10v1, ConvertToVector256UInt64, 32, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtps2uqq, INS_vcvtpd2uqq}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX10v1, ConvertToVector256UInt64WithTruncation, 32, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvttps2uqq, INS_vcvttpd2uqq}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX10v1, DetectConflicts, -1, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpconflictd, INS_vpconflictd, INS_vpconflictq, INS_vpconflictq, INS_invalid, INS_invalid}, 
HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(AVX10v1, DivideScalar, 16, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_divss, INS_divsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_EmbRoundingCompatible) -HARDWARE_INTRINSIC(AVX10v1, Fixup, -1, 4, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vfixupimmps, INS_vfixupimmpd}, HW_Category_IMM, HW_Flag_SpecialImport|HW_Flag_FullRangeIMM) +HARDWARE_INTRINSIC(AVX10v1, Fixup, -1, 4, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vfixupimmps, INS_vfixupimmpd}, HW_Category_IMM, HW_Flag_SpecialImport|HW_Flag_FullRangeIMM|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(AVX10v1, FixupScalar, 16, 4, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vfixupimmss, INS_vfixupimmsd}, HW_Category_IMM, HW_Flag_SpecialImport|HW_Flag_FullRangeIMM|HW_Flag_CopyUpperBits) -HARDWARE_INTRINSIC(AVX10v1, FusedMultiplyAddNegatedScalar, 16, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vfnmadd213ss, INS_vfnmadd213sd}, HW_Category_SIMDScalar, HW_Flag_SpecialCodeGen|HW_Flag_FmaIntrinsic|HW_Flag_RmwIntrinsic|HW_Flag_CopyUpperBits|HW_Flag_EmbRoundingCompatible) -HARDWARE_INTRINSIC(AVX10v1, FusedMultiplyAddScalar, 16, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vfmadd213ss, INS_vfmadd213sd}, HW_Category_SIMDScalar, HW_Flag_SpecialCodeGen|HW_Flag_FmaIntrinsic|HW_Flag_RmwIntrinsic|HW_Flag_CopyUpperBits|HW_Flag_EmbRoundingCompatible) -HARDWARE_INTRINSIC(AVX10v1, FusedMultiplySubtractNegatedScalar, 16, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vfnmsub213ss, INS_vfnmsub213sd}, HW_Category_SIMDScalar, HW_Flag_SpecialCodeGen|HW_Flag_FmaIntrinsic|HW_Flag_RmwIntrinsic|HW_Flag_CopyUpperBits|HW_Flag_EmbRoundingCompatible) -HARDWARE_INTRINSIC(AVX10v1, FusedMultiplySubtractScalar, 16, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vfmsub213ss, INS_vfmsub213sd}, HW_Category_SIMDScalar, HW_Flag_SpecialCodeGen|HW_Flag_FmaIntrinsic|HW_Flag_RmwIntrinsic|HW_Flag_CopyUpperBits|HW_Flag_EmbRoundingCompatible) -HARDWARE_INTRINSIC(AVX10v1, GetExponent, -1, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vgetexpps, INS_vgetexppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(AVX10v1, FusedMultiplyAddNegatedScalar, 16, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vfnmadd213ss, INS_vfnmadd213sd}, HW_Category_SIMDScalar, HW_Flag_SpecialCodeGen|HW_Flag_FmaIntrinsic|HW_Flag_RmwIntrinsic|HW_Flag_EmbRoundingCompatible|HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(AVX10v1, FusedMultiplyAddScalar, 16, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vfmadd213ss, INS_vfmadd213sd}, HW_Category_SIMDScalar, HW_Flag_SpecialCodeGen|HW_Flag_FmaIntrinsic|HW_Flag_RmwIntrinsic|HW_Flag_EmbRoundingCompatible|HW_Flag_CopyUpperBits) 
+HARDWARE_INTRINSIC(AVX10v1, FusedMultiplySubtractNegatedScalar, 16, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vfnmsub213ss, INS_vfnmsub213sd}, HW_Category_SIMDScalar, HW_Flag_SpecialCodeGen|HW_Flag_FmaIntrinsic|HW_Flag_RmwIntrinsic|HW_Flag_EmbRoundingCompatible|HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(AVX10v1, FusedMultiplySubtractScalar, 16, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vfmsub213ss, INS_vfmsub213sd}, HW_Category_SIMDScalar, HW_Flag_SpecialCodeGen|HW_Flag_FmaIntrinsic|HW_Flag_RmwIntrinsic|HW_Flag_EmbRoundingCompatible|HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(AVX10v1, GetExponent, -1, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vgetexpps, INS_vgetexppd}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(AVX10v1, GetExponentScalar, 16, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vgetexpss, INS_vgetexpsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) -HARDWARE_INTRINSIC(AVX10v1, GetMantissa, -1, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vgetmantps, INS_vgetmantpd}, HW_Category_IMM, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(AVX10v1, GetMantissa, -1, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vgetmantps, INS_vgetmantpd}, HW_Category_IMM, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(AVX10v1, GetMantissaScalar, 16, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vgetmantss, INS_vgetmantsd}, HW_Category_IMM, HW_Flag_CopyUpperBits) -HARDWARE_INTRINSIC(AVX10v1, LeadingZeroCount, -1, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vplzcntd, INS_vplzcntd, INS_vplzcntq, INS_vplzcntq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(AVX10v1, Max, -1, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpmaxsq, INS_vpmaxuq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible) -HARDWARE_INTRINSIC(AVX10v1, Min, -1, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpminsq, INS_vpminuq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible) -HARDWARE_INTRINSIC(AVX10v1, MultiplyLow, -1, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpmullq, INS_vpmullq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible) +HARDWARE_INTRINSIC(AVX10v1, LeadingZeroCount, -1, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vplzcntd, INS_vplzcntd, INS_vplzcntq, INS_vplzcntq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX10v1, Max, -1, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpmaxsq, INS_vpmaxuq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible|HW_Flag_Commutative) 
+HARDWARE_INTRINSIC(AVX10v1, Min, -1, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpminsq, INS_vpminuq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible|HW_Flag_Commutative) +HARDWARE_INTRINSIC(AVX10v1, MultiplyLow, -1, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpmullq, INS_vpmullq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(AVX10v1, MultiplyScalar, 16, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_mulss, INS_mulsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_EmbRoundingCompatible) -HARDWARE_INTRINSIC(AVX10v1, PermuteVar16x16, 32, 2, false, {INS_invalid, INS_invalid, INS_vpermw, INS_vpermw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport) -HARDWARE_INTRINSIC(AVX10v1, PermuteVar16x16x2, 32, 3, false, {INS_invalid, INS_invalid, INS_vpermt2w, INS_vpermt2w, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_PermuteVar2x|HW_Flag_RmwIntrinsic) -HARDWARE_INTRINSIC(AVX10v1, PermuteVar16x8, 16, 2, false, {INS_vpermb, INS_vpermb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport) -HARDWARE_INTRINSIC(AVX10v1, PermuteVar16x8x2, 16, 3, false, {INS_vpermt2b, INS_vpermt2b, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_PermuteVar2x|HW_Flag_RmwIntrinsic) -HARDWARE_INTRINSIC(AVX10v1, PermuteVar2x64x2, 16, 3, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpermt2q, INS_vpermt2q, INS_invalid, INS_vpermt2pd}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_PermuteVar2x|HW_Flag_RmwIntrinsic) -HARDWARE_INTRINSIC(AVX10v1, PermuteVar32x8, 32, 2, false, {INS_vpermb, INS_vpermb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport) -HARDWARE_INTRINSIC(AVX10v1, PermuteVar32x8x2, 32, 3, false, {INS_vpermt2b, INS_vpermt2b, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_PermuteVar2x|HW_Flag_RmwIntrinsic) -HARDWARE_INTRINSIC(AVX10v1, PermuteVar4x64, 32, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpermq_reg, INS_vpermq_reg, INS_invalid, INS_vpermpd_reg}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport) -HARDWARE_INTRINSIC(AVX10v1, PermuteVar4x64x2, 32, 3, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpermt2q, INS_vpermt2q, INS_invalid, INS_vpermt2pd}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_PermuteVar2x|HW_Flag_RmwIntrinsic) -HARDWARE_INTRINSIC(AVX10v1, PermuteVar4x32x2, 16, 3, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpermt2d, INS_vpermt2d, INS_invalid, INS_invalid, INS_vpermt2ps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_PermuteVar2x|HW_Flag_RmwIntrinsic) -HARDWARE_INTRINSIC(AVX10v1, PermuteVar8x16 , 16, 2, false, {INS_invalid, 
INS_invalid, INS_vpermw, INS_vpermw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport) -HARDWARE_INTRINSIC(AVX10v1, PermuteVar8x16x2, 16, 3, false, {INS_invalid, INS_invalid, INS_vpermt2w, INS_vpermt2w, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_PermuteVar2x|HW_Flag_RmwIntrinsic) -HARDWARE_INTRINSIC(AVX10v1, PermuteVar8x32x2, 32, 3, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpermt2d, INS_vpermt2d, INS_invalid, INS_invalid, INS_vpermt2ps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_PermuteVar2x|HW_Flag_RmwIntrinsic) -HARDWARE_INTRINSIC(AVX10v1, Range, -1, 3, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vrangeps, INS_vrangepd}, HW_Category_IMM, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(AVX10v1, PermuteVar16x16, 32, 2, false, {INS_invalid, INS_invalid, INS_vpermw, INS_vpermw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX10v1, PermuteVar16x16x2, 32, 3, false, {INS_invalid, INS_invalid, INS_vpermt2w, INS_vpermt2w, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_PermuteVar2x|HW_Flag_RmwIntrinsic|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX10v1, PermuteVar16x8, 16, 2, false, {INS_vpermb, INS_vpermb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX10v1, PermuteVar16x8x2, 16, 3, false, {INS_vpermt2b, INS_vpermt2b, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_PermuteVar2x|HW_Flag_RmwIntrinsic|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX10v1, PermuteVar2x64x2, 16, 3, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpermt2q, INS_vpermt2q, INS_invalid, INS_vpermt2pd}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_PermuteVar2x|HW_Flag_RmwIntrinsic|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX10v1, PermuteVar32x8, 32, 2, false, {INS_vpermb, INS_vpermb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX10v1, PermuteVar32x8x2, 32, 3, false, {INS_vpermt2b, INS_vpermt2b, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_PermuteVar2x|HW_Flag_RmwIntrinsic|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX10v1, PermuteVar4x64, 32, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpermq_reg, INS_vpermq_reg, INS_invalid, INS_vpermpd_reg}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX10v1, PermuteVar4x64x2, 32, 3, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpermt2q, INS_vpermt2q, INS_invalid, INS_vpermt2pd}, HW_Category_SimpleSIMD, 
HW_Flag_SpecialCodeGen|HW_Flag_PermuteVar2x|HW_Flag_RmwIntrinsic|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX10v1, PermuteVar4x32x2, 16, 3, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpermt2d, INS_vpermt2d, INS_invalid, INS_invalid, INS_vpermt2ps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_PermuteVar2x|HW_Flag_RmwIntrinsic|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX10v1, PermuteVar8x16, 16, 2, false, {INS_invalid, INS_invalid, INS_vpermw, INS_vpermw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX10v1, PermuteVar8x16x2, 16, 3, false, {INS_invalid, INS_invalid, INS_vpermt2w, INS_vpermt2w, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_PermuteVar2x|HW_Flag_RmwIntrinsic|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX10v1, PermuteVar8x32x2, 32, 3, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpermt2d, INS_vpermt2d, INS_invalid, INS_invalid, INS_vpermt2ps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_PermuteVar2x|HW_Flag_RmwIntrinsic|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX10v1, Range, -1, 3, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vrangeps, INS_vrangepd}, HW_Category_IMM, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(AVX10v1, RangeScalar, 16, 3, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vrangess, INS_vrangesd}, HW_Category_IMM, HW_Flag_CopyUpperBits) -HARDWARE_INTRINSIC(AVX10v1, Reciprocal14, -1, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vrcp14ps, INS_vrcp14pd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(AVX10v1, Reciprocal14, -1, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vrcp14ps, INS_vrcp14pd}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(AVX10v1, Reciprocal14Scalar, 16, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vrcp14ss, INS_vrcp14sd}, HW_Category_SimpleSIMD, HW_Flag_CopyUpperBits) -HARDWARE_INTRINSIC(AVX10v1, ReciprocalSqrt14, -1, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vrsqrt14ps, INS_vrsqrt14pd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(AVX10v1, ReciprocalSqrt14, -1, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vrsqrt14ps, INS_vrsqrt14pd}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(AVX10v1, ReciprocalSqrt14Scalar, 16, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vrsqrt14ss, INS_vrsqrt14sd}, HW_Category_SimpleSIMD, HW_Flag_CopyUpperBits) -HARDWARE_INTRINSIC(AVX10v1, Reduce, -1, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, 
INS_invalid, INS_vreduceps, INS_vreducepd}, HW_Category_IMM, HW_Flag_FullRangeIMM) +HARDWARE_INTRINSIC(AVX10v1, Reduce, -1, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vreduceps, INS_vreducepd}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(AVX10v1, ReduceScalar, 16, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vreducess, INS_vreducesd}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_CopyUpperBits) -HARDWARE_INTRINSIC(AVX10v1, RotateLeft, -1, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vprold, INS_vprold, INS_vprolq, INS_vprolq, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_MaybeIMM|HW_Flag_MaybeNoJmpTableIMM|HW_Flag_FullRangeIMM) -HARDWARE_INTRINSIC(AVX10v1, RotateLeftVariable, -1, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vprolvd, INS_vprolvd, INS_vprolvq, INS_vprolvq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(AVX10v1, RotateRight, -1, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vprord, INS_vprord, INS_vprorq, INS_vprorq, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_MaybeIMM|HW_Flag_MaybeNoJmpTableIMM|HW_Flag_FullRangeIMM) -HARDWARE_INTRINSIC(AVX10v1, RotateRightVariable, -1, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vprorvd, INS_vprorvd, INS_vprorvq, INS_vprorvq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(AVX10v1, RoundScale, -1, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vrndscaleps, INS_vrndscalepd}, HW_Category_IMM, HW_Flag_FullRangeIMM) +HARDWARE_INTRINSIC(AVX10v1, RotateLeft, -1, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vprold, INS_vprold, INS_vprolq, INS_vprolq, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_MaybeIMM|HW_Flag_MaybeNoJmpTableIMM|HW_Flag_FullRangeIMM|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX10v1, RotateLeftVariable, -1, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vprolvd, INS_vprolvd, INS_vprolvq, INS_vprolvq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX10v1, RotateRight, -1, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vprord, INS_vprord, INS_vprorq, INS_vprorq, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_MaybeIMM|HW_Flag_MaybeNoJmpTableIMM|HW_Flag_FullRangeIMM|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX10v1, RotateRightVariable, -1, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vprorvd, INS_vprorvd, INS_vprorvq, INS_vprorvq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX10v1, RoundScale, -1, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vrndscaleps, INS_vrndscalepd}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(AVX10v1, RoundScaleScalar, 16, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vrndscaless, 
INS_vrndscalesd}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_CopyUpperBits) -HARDWARE_INTRINSIC(AVX10v1, Scale, -1, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vscalefps, INS_vscalefpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(AVX10v1, Scale, -1, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vscalefps, INS_vscalefpd}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(AVX10v1, ScaleScalar, 16, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vscalefss, INS_vscalefsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_EmbRoundingCompatible) -HARDWARE_INTRINSIC(AVX10v1, ShiftLeftLogicalVariable, -1, 2, false, {INS_invalid, INS_invalid, INS_vpsllvw, INS_vpsllvw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(AVX10v1, ShiftRightArithmetic, -1, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpsraq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_MaybeIMM|HW_Flag_NoJmpTableIMM|HW_Flag_FullRangeIMM) +HARDWARE_INTRINSIC(AVX10v1, ShiftLeftLogicalVariable, -1, 2, false, {INS_invalid, INS_invalid, INS_vpsllvw, INS_vpsllvw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX10v1, ShiftRightArithmetic, -1, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpsraq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_MaybeIMM|HW_Flag_NoJmpTableIMM|HW_Flag_FullRangeIMM|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(AVX10v1, ShiftRightArithmeticVariable, -1, 2, true, {INS_invalid, INS_invalid, INS_vpsravw, INS_invalid, INS_invalid, INS_invalid, INS_vpsravq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible) -HARDWARE_INTRINSIC(AVX10v1, ShiftRightLogicalVariable, -1, 2, false, {INS_invalid, INS_invalid, INS_vpsrlvw, INS_vpsrlvw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(AVX10v1, Shuffle2x128, 32, 3, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vshufi32x4, INS_vshufi32x4, INS_vshufi64x2, INS_vshufi64x2, INS_vshuff32x4, INS_vshuff64x2}, HW_Category_IMM, HW_Flag_FullRangeIMM) +HARDWARE_INTRINSIC(AVX10v1, ShiftRightLogicalVariable, -1, 2, false, {INS_invalid, INS_invalid, INS_vpsrlvw, INS_vpsrlvw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX10v1, Shuffle2x128, 32, 3, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vshufi32x4, INS_vshufi32x4, INS_vshufi64x2, INS_vshufi64x2, INS_vshuff32x4, INS_vshuff64x2}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(AVX10v1, SqrtScalar, 16, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sqrtss, INS_sqrtsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_EmbRoundingCompatible) HARDWARE_INTRINSIC(AVX10v1, 
SubtractScalar, 16, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_subss, INS_subsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_EmbRoundingCompatible) -HARDWARE_INTRINSIC(AVX10v1, SumAbsoluteDifferencesInBlock32, -1, 3, false, {INS_invalid, INS_invalid, INS_invalid, INS_vdbpsadbw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM) -HARDWARE_INTRINSIC(AVX10v1, TernaryLogic, -1, 4, true, {INS_vpternlogd, INS_vpternlogd, INS_vpternlogd, INS_vpternlogd, INS_vpternlogd, INS_vpternlogd, INS_vpternlogq, INS_vpternlogq, INS_vpternlogd, INS_vpternlogq}, HW_Category_IMM, HW_Flag_SpecialImport|HW_Flag_FullRangeIMM) +HARDWARE_INTRINSIC(AVX10v1, SumAbsoluteDifferencesInBlock32, -1, 3, false, {INS_invalid, INS_invalid, INS_invalid, INS_vdbpsadbw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX10v1, TernaryLogic, -1, 4, true, {INS_vpternlogd, INS_vpternlogd, INS_vpternlogd, INS_vpternlogd, INS_vpternlogd, INS_vpternlogd, INS_vpternlogq, INS_vpternlogq, INS_vpternlogd, INS_vpternlogq}, HW_Category_IMM, HW_Flag_SpecialImport|HW_Flag_FullRangeIMM|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** @@ -1484,30 +1484,30 @@ HARDWARE_INTRINSIC(AVX10v1, TernaryLogic, // {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // AVX10V1_V512 Intrinsics -HARDWARE_INTRINSIC(AVX10v1_V512, And, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andps, INS_andpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative) -HARDWARE_INTRINSIC(AVX10v1_V512, AndNot, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andnps, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible) +HARDWARE_INTRINSIC(AVX10v1_V512, And, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andps, INS_andpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX10v1_V512, AndNot, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andnps, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(AVX10v1_V512, BroadcastPairScalarToVector512, 64, 1, true, {INS_invalid, 
INS_invalid, INS_invalid, INS_invalid, INS_vbroadcasti32x2, INS_vbroadcasti32x2, INS_invalid, INS_invalid, INS_vbroadcastf32x2, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX10v1_V512, BroadcastVector128ToVector512, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcasti64x2, INS_vbroadcasti64x2, INS_invalid, INS_vbroadcastf64x2}, HW_Category_MemoryLoad, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX10v1_V512, BroadcastVector256ToVector512, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcasti32x8, INS_vbroadcasti32x8, INS_invalid, INS_invalid, INS_vbroadcastf32x8, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(AVX10v1_V512, ConvertToVector256Single, 64, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtqq2ps, INS_vcvtuqq2ps, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_EmbRoundingCompatible) -HARDWARE_INTRINSIC(AVX10v1_V512, ConvertToVector512Double, 64, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtqq2pd, INS_vcvtuqq2pd, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_EmbRoundingCompatible) -HARDWARE_INTRINSIC(AVX10v1_V512, ConvertToVector512Int64, 64, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtps2qq, INS_vcvtpd2qq}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_EmbRoundingCompatible) -HARDWARE_INTRINSIC(AVX10v1_V512, ConvertToVector512Int64WithTruncation, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvttps2qq, INS_vcvttpd2qq}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg) -HARDWARE_INTRINSIC(AVX10v1_V512, ConvertToVector512UInt64, 64, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtps2uqq, INS_vcvtpd2uqq}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_EmbRoundingCompatible) -HARDWARE_INTRINSIC(AVX10v1_V512, ConvertToVector512UInt64WithTruncation, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvttps2uqq, INS_vcvttpd2uqq}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg) -HARDWARE_INTRINSIC(AVX10v1_V512, DetectConflicts, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpconflictd, INS_vpconflictd, INS_vpconflictq, INS_vpconflictq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(AVX10v1_V512, ExtractVector128, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vextracti64x2, INS_vextracti64x2, INS_invalid, INS_vextractf64x2}, HW_Category_IMM, HW_Flag_FullRangeIMM) -HARDWARE_INTRINSIC(AVX10v1_V512, ExtractVector256, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vextracti32x8, INS_vextracti32x8, INS_invalid, INS_invalid, INS_vextractf32x8, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM) -HARDWARE_INTRINSIC(AVX10v1_V512, InsertVector128, 64, 3, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vinserti64x2, INS_vinserti64x2, INS_invalid, INS_vinsertf64x2}, HW_Category_IMM, HW_Flag_FullRangeIMM) -HARDWARE_INTRINSIC(AVX10v1_V512, InsertVector256, 64, 3, true, 
{INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vinserti32x8, INS_vinserti32x8, INS_invalid, INS_invalid, INS_vinsertf32x8, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM) -HARDWARE_INTRINSIC(AVX10v1_V512, LeadingZeroCount, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vplzcntd, INS_vplzcntd, INS_vplzcntq, INS_vplzcntq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(AVX10v1_V512, MultiplyLow, 64, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpmullq, INS_vpmullq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible) -HARDWARE_INTRINSIC(AVX10v1_V512, Or, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_orps, INS_orpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible) -HARDWARE_INTRINSIC(AVX10v1_V512, PermuteVar64x8, 64, 2, false, {INS_vpermb, INS_vpermb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport) -HARDWARE_INTRINSIC(AVX10v1_V512, PermuteVar64x8x2, 64, 3, false, {INS_vpermt2b, INS_vpermt2b, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_PermuteVar2x|HW_Flag_RmwIntrinsic) -HARDWARE_INTRINSIC(AVX10v1_V512, Range, 64, 3, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vrangeps, INS_vrangepd}, HW_Category_IMM, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(AVX10v1_V512, Reduce, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vreduceps, INS_vreducepd}, HW_Category_IMM, HW_Flag_FullRangeIMM) -HARDWARE_INTRINSIC(AVX10v1_V512, Xor, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_xorps, INS_xorpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible) +HARDWARE_INTRINSIC(AVX10v1_V512, ConvertToVector256Single, 64, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtqq2ps, INS_vcvtuqq2ps, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible|HW_Flag_EmbRoundingCompatible) +HARDWARE_INTRINSIC(AVX10v1_V512, ConvertToVector512Double, 64, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtqq2pd, INS_vcvtuqq2pd, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible|HW_Flag_EmbRoundingCompatible) +HARDWARE_INTRINSIC(AVX10v1_V512, ConvertToVector512Int64, 64, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtps2qq, INS_vcvtpd2qq}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible|HW_Flag_EmbRoundingCompatible) +HARDWARE_INTRINSIC(AVX10v1_V512, ConvertToVector512Int64WithTruncation, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvttps2qq, INS_vcvttpd2qq}, HW_Category_SimpleSIMD, 
HW_Flag_BaseTypeFromFirstArg|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX10v1_V512, ConvertToVector512UInt64, 64, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtps2uqq, INS_vcvtpd2uqq}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible|HW_Flag_EmbRoundingCompatible) +HARDWARE_INTRINSIC(AVX10v1_V512, ConvertToVector512UInt64WithTruncation, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvttps2uqq, INS_vcvttpd2uqq}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX10v1_V512, DetectConflicts, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpconflictd, INS_vpconflictd, INS_vpconflictq, INS_vpconflictq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX10v1_V512, ExtractVector128, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vextracti64x2, INS_vextracti64x2, INS_invalid, INS_vextractf64x2}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX10v1_V512, ExtractVector256, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vextracti32x8, INS_vextracti32x8, INS_invalid, INS_invalid, INS_vextractf32x8, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX10v1_V512, InsertVector128, 64, 3, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vinserti64x2, INS_vinserti64x2, INS_invalid, INS_vinsertf64x2}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX10v1_V512, InsertVector256, 64, 3, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vinserti32x8, INS_vinserti32x8, INS_invalid, INS_invalid, INS_vinsertf32x8, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX10v1_V512, LeadingZeroCount, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vplzcntd, INS_vplzcntd, INS_vplzcntq, INS_vplzcntq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX10v1_V512, MultiplyLow, 64, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpmullq, INS_vpmullq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX10v1_V512, Or, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_orps, INS_orpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX10v1_V512, PermuteVar64x8, 64, 2, false, {INS_vpermb, INS_vpermb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX10v1_V512, PermuteVar64x8x2, 64, 3, false, {INS_vpermt2b, INS_vpermt2b, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, 
HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_PermuteVar2x|HW_Flag_RmwIntrinsic|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX10v1_V512, Range, 64, 3, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vrangeps, INS_vrangepd}, HW_Category_IMM, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX10v1_V512, Reduce, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vreduceps, INS_vreducepd}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX10v1_V512, Xor, 64, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_xorps, INS_xorpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) #endif // FEATURE_HW_INTRINSIC diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp index ac1caa3f0e43fd..3e242da2bc77bc 100644 --- a/src/coreclr/jit/hwintrinsicxarch.cpp +++ b/src/coreclr/jit/hwintrinsicxarch.cpp @@ -1458,7 +1458,21 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, { assert(sig->numArgs == 1); assert(varTypeIsLong(simdBaseType)); - if (IsBaselineVector512IsaSupportedOpportunistically()) + if ((simdSize != 64) && compOpportunisticallyDependsOn(InstructionSet_AVX10v1)) + { + if (simdSize == 32) + { + intrinsic = NI_AVX10v1_ConvertToVector256Double; + } + else + { + assert(simdSize == 16); + intrinsic = NI_AVX10v1_ConvertToVector128Double; + } + op1 = impSIMDPopStack(); + retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, simdBaseJitType, simdSize); + } + else if (IsBaselineVector512IsaSupportedOpportunistically()) { if (simdSize == 64) { @@ -1513,7 +1527,8 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, assert(sig->numArgs == 1); assert(simdBaseType == TYP_DOUBLE); - if (IsBaselineVector512IsaSupportedOpportunistically()) + if (IsBaselineVector512IsaSupportedOpportunistically() || + (simdSize != 64 && compOpportunisticallyDependsOn(InstructionSet_AVX10v1))) { op1 = impSIMDPopStack(); retNode = gtNewSimdCvtNode(retType, op1, CORINFO_TYPE_LONG, simdBaseJitType, simdSize); @@ -1528,7 +1543,8 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, assert(sig->numArgs == 1); assert(simdBaseType == TYP_DOUBLE); - if (IsBaselineVector512IsaSupportedOpportunistically()) + if (IsBaselineVector512IsaSupportedOpportunistically() || + (simdSize != 64 && compOpportunisticallyDependsOn(InstructionSet_AVX10v1))) { op1 = impSIMDPopStack(); retNode = gtNewSimdCvtNativeNode(retType, op1, CORINFO_TYPE_LONG, simdBaseJitType, simdSize); @@ -1560,6 +1576,21 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, unreached(); } } + else if (simdBaseType == TYP_UINT && simdSize != 64 && + compOpportunisticallyDependsOn(InstructionSet_AVX10v1)) + { + switch (simdSize) + { + case 16: + intrinsic = NI_AVX10v1_ConvertToVector128Single; + break; + case 32: + intrinsic = NI_AVX10v1_ConvertToVector256Single; + break; + default: + unreached(); + } + } else if (simdBaseType == TYP_UINT && IsBaselineVector512IsaSupportedOpportunistically()) { switch (simdSize) @@ -1592,7 +1623,8 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, assert(sig->numArgs == 1); assert(simdBaseType == TYP_FLOAT); - if 
(IsBaselineVector512IsaSupportedOpportunistically()) + if (IsBaselineVector512IsaSupportedOpportunistically() || + (simdSize != 64 && compOpportunisticallyDependsOn(InstructionSet_AVX10v1))) { op1 = impSIMDPopStack(); retNode = gtNewSimdCvtNode(retType, op1, CORINFO_TYPE_UINT, simdBaseJitType, simdSize); @@ -1607,7 +1639,8 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, assert(sig->numArgs == 1); assert(simdBaseType == TYP_FLOAT); - if (IsBaselineVector512IsaSupportedOpportunistically()) + if (IsBaselineVector512IsaSupportedOpportunistically() || + (simdSize != 64 && compOpportunisticallyDependsOn(InstructionSet_AVX10v1))) { op1 = impSIMDPopStack(); retNode = gtNewSimdCvtNativeNode(retType, op1, CORINFO_TYPE_UINT, simdBaseJitType, simdSize); @@ -1621,7 +1654,8 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, { assert(sig->numArgs == 1); assert(simdBaseType == TYP_DOUBLE); - if (IsBaselineVector512IsaSupportedOpportunistically()) + if (IsBaselineVector512IsaSupportedOpportunistically() || + (simdSize != 64 && compOpportunisticallyDependsOn(InstructionSet_AVX10v1))) { op1 = impSIMDPopStack(); retNode = gtNewSimdCvtNode(retType, op1, CORINFO_TYPE_ULONG, simdBaseJitType, simdSize); @@ -1636,7 +1670,8 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, assert(sig->numArgs == 1); assert(simdBaseType == TYP_DOUBLE); - if (IsBaselineVector512IsaSupportedOpportunistically()) + if (IsBaselineVector512IsaSupportedOpportunistically() || + (simdSize != 64 && compOpportunisticallyDependsOn(InstructionSet_AVX10v1))) { op1 = impSIMDPopStack(); retNode = gtNewSimdCvtNativeNode(retType, op1, CORINFO_TYPE_ULONG, simdBaseJitType, simdSize); @@ -3630,6 +3665,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, case NI_AVX512F_FixupScalar: case NI_AVX512F_VL_Fixup: case NI_AVX10v1_Fixup: + case NI_AVX10v1_FixupScalar: { assert(sig->numArgs == 4); diff --git a/src/coreclr/jit/importercalls.cpp b/src/coreclr/jit/importercalls.cpp index 943c75d2639bf1..92ebe7847ecac2 100644 --- a/src/coreclr/jit/importercalls.cpp +++ b/src/coreclr/jit/importercalls.cpp @@ -5225,6 +5225,10 @@ GenTree* Compiler::impPrimitiveNamedIntrinsic(NamedIntrinsic intrinsic, { hwIntrinsicId = NI_SSE_ConvertToInt32WithTruncation; } + else if (compOpportunisticallyDependsOn(InstructionSet_AVX10v1)) + { + hwIntrinsicId = NI_AVX10v1_ConvertToUInt32WithTruncation; + } else if (IsBaselineVector512IsaSupportedOpportunistically()) { hwIntrinsicId = NI_AVX512F_ConvertToUInt32WithTruncation; @@ -5238,6 +5242,10 @@ GenTree* Compiler::impPrimitiveNamedIntrinsic(NamedIntrinsic intrinsic, { hwIntrinsicId = NI_SSE2_ConvertToInt32WithTruncation; } + else if (compOpportunisticallyDependsOn(InstructionSet_AVX10v1)) + { + hwIntrinsicId = NI_AVX10v1_ConvertToUInt32WithTruncation; + } else if (IsBaselineVector512IsaSupportedOpportunistically()) { hwIntrinsicId = NI_AVX512F_ConvertToUInt32WithTruncation; @@ -8784,7 +8792,12 @@ GenTree* Compiler::impEstimateIntrinsic(CORINFO_METHOD_HANDLE method, case NI_System_Math_ReciprocalEstimate: { #if defined(TARGET_XARCH) - if (compExactlyDependsOn(InstructionSet_AVX512F)) + if (compExactlyDependsOn(InstructionSet_AVX10v1)) + { + simdType = TYP_SIMD16; + intrinsicId = NI_AVX10v1_Reciprocal14Scalar; + } + else if (compExactlyDependsOn(InstructionSet_AVX512F)) { simdType = TYP_SIMD16; intrinsicId = NI_AVX512F_Reciprocal14Scalar; @@ -9234,7 +9247,167 @@ GenTree* Compiler::impMinMaxIntrinsic(CORINFO_METHOD_HANDLE method, } #if defined(FEATURE_HW_INTRINSICS) 
&& defined(TARGET_XARCH) - if (compOpportunisticallyDependsOn(InstructionSet_AVX512DQ)) + if (compOpportunisticallyDependsOn(InstructionSet_AVX10v1)) + { + // We are constructing a chain of intrinsics similar to: + // var op1 = Vector128.CreateScalarUnsafe(x); + // var op2 = Vector128.CreateScalarUnsafe(y); + // + // var tmp = Avx10v1.RangeScalar(op1, op2, imm8); + // var tbl = Vector128.CreateScalarUnsafe(0x00); + // + // tmp = Avx10v1.FixupScalar(tmp, op2, tbl, 0x00); + // tmp = Avx10v1.FixupScalar(tmp, op1, tbl, 0x00); + // + // return tmp.ToScalar(); + + // RangeScalar operates by default almost as MaxNumber or MinNumber + // but, it propagates sNaN and does not propagate qNaN. So we need + // an additional fixup to ensure we propagate qNaN as well. + + uint8_t imm8; + + if (isMax) + { + if (isMagnitude) + { + // 0b01_11: Sign(CompareResult), Max-Abs Value + imm8 = 0x07; + } + else + { + // 0b01_01: Sign(CompareResult), Max Value + imm8 = 0x05; + } + } + else if (isMagnitude) + { + // 0b01_10: Sign(CompareResult), Min-Abs Value + imm8 = 0x06; + } + else + { + // 0b01_00: Sign(CompareResult), Min Value + imm8 = 0x04; + } + + GenTree* op3 = gtNewIconNode(imm8); + GenTree* op2 = gtNewSimdCreateScalarUnsafeNode(TYP_SIMD16, impPopStack().val, callJitType, 16); + GenTree* op1 = gtNewSimdCreateScalarUnsafeNode(TYP_SIMD16, impPopStack().val, callJitType, 16); + + GenTree* op2Clone; + op2 = impCloneExpr(op2, &op2Clone, CHECK_SPILL_ALL, nullptr DEBUGARG("Cloning op2 for Math.Max/Min")); + + GenTree* op1Clone; + op1 = impCloneExpr(op1, &op1Clone, CHECK_SPILL_ALL, nullptr DEBUGARG("Cloning op1 for Math.Max/Min")); + + GenTree* tmp = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, op2, op3, NI_AVX10v1_RangeScalar, callJitType, 16); + + // FixupScalar(left, right, table, control) computes the input type of right + // adjusts it based on the table and then returns + // + // In our case, left is going to be the result of the RangeScalar operation, + // which is either sNaN or a normal value, and right is going to be op1 or op2. 
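The comment block above describes the managed-level pattern this import builds. A minimal C# sketch of the isNumber path is shown below; it assumes the new Avx10v1.RangeScalar and Avx10v1.FixupScalar overloads mirror the existing Avx512DQ.RangeScalar / Avx512F.FixupScalar signatures (Vector128 operands, a Vector128<int> fixup table for float, byte immediates), and the helper name MaxNumberSketch is illustrative only, not part of this change.

    using System;
    using System.Runtime.Intrinsics;
    using System.Runtime.Intrinsics.X86;

    internal static class RangeScalarSketch
    {
        // Sketch of the "isNumber" max path: RangeScalar picks the maximum but only
        // propagates sNaN, so two FixupScalar passes with table 0x11111100 make a
        // lone NaN input yield the other (numeric) operand, as in the import above.
        private static float MaxNumberSketch(float x, float y)
        {
            Vector128<float> op1 = Vector128.CreateScalarUnsafe(x);
            Vector128<float> op2 = Vector128.CreateScalarUnsafe(y);

            // imm8 0b01_01 (0x05): sign from the compare result, value = maximum
            Vector128<float> tmp = Avx10v1.RangeScalar(op1, op2, 0x05);

            // NaN classes preserve the left operand, all other classes take the right
            Vector128<int> tbl = Vector128.CreateScalarUnsafe(0x11111100);

            Vector128<float> fixedOps = Avx10v1.FixupScalar(op1, op2, tbl, 0x00);
            tmp = Avx10v1.FixupScalar(fixedOps, tmp, tbl, 0x00);

            return tmp.ToScalar();
        }

        private static void Main()
        {
            if (Avx10v1.IsSupported)
            {
                // Per the qnan/norm table in the comments above, this should print 2.
                Console.WriteLine(MaxNumberSketch(float.NaN, 2.0f));
            }
        }
    }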
+ + GenTree* tbl1 = gtNewVconNode(TYP_SIMD16); + GenTree* tbl2; + + // We currently have (commutative) + // * snan, snan = snan + // * snan, qnan = snan + // * snan, norm = snan + // * qnan, qnan = qnan + // * qnan, norm = norm + // * norm, norm = norm + + if (isNumber) + { + // We need to fixup the case of: + // * snan, norm = snan + // + // Instead, it should be: + // * snan, norm = norm + + // First look at op1 and op2 using op2 as the classification + // + // If op2 is norm, we take op2 (norm) + // If op2 is nan, we take op1 ( nan or norm) + // + // Thus, if one input was norm the fixup is now norm + + // QNAN: 0b0000: Preserve left + // SNAN: 0b0000 + // ZERO: 0b0001: Preserve right + // +ONE: 0b0001 + // -INF: 0b0001 + // +INF: 0b0001 + // -VAL: 0b0001 + // +VAL: 0b0001 + tbl1->AsVecCon()->gtSimdVal.i32[0] = 0x11111100; + + // Next look at result and fixup using result as the classification + // + // If result is norm, we take the result (norm) + // If result is nan, we take the fixup ( nan or norm) + // + // Thus if either input was snan, we now have norm as expected + // Otherwise, the result was already correct + + tbl1 = impCloneExpr(tbl1, &tbl2, CHECK_SPILL_ALL, nullptr DEBUGARG("Cloning tbl for Math.Max/Min")); + + op1Clone = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1Clone, op2Clone, tbl1, gtNewIconNode(0), + NI_AVX10v1_FixupScalar, callJitType, 16); + + tmp = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1Clone, tmp, tbl2, gtNewIconNode(0), NI_AVX10v1_FixupScalar, + callJitType, 16); + } + else + { + // We need to fixup the case of: + // * qnan, norm = norm + // + // Instead, it should be: + // * qnan, norm = qnan + + // First look at op1 and op2 using op2 as the classification + // + // If op2 is norm, we take op1 ( nan or norm) + // If op2 is snan, we take op1 ( nan or norm) + // If op2 is qnan, we take op2 (qnan) + // + // Thus, if either input was qnan the fixup is now qnan + + // QNAN: 0b0001: Preserve right + // SNAN: 0b0000: Preserve left + // ZERO: 0b0000 + // +ONE: 0b0000 + // -INF: 0b0000 + // +INF: 0b0000 + // -VAL: 0b0000 + // +VAL: 0b0000 + tbl1->AsVecCon()->gtSimdVal.i32[0] = 0x00000001; + + // Next look at result and fixup using fixup as the classification + // + // If fixup is norm, we take the result (norm) + // If fixup is sNaN, we take the result (sNaN) + // If fixup is qNaN, we take the fixup (qNaN) + // + // Thus if the fixup was qnan, we now have qnan as expected + // Otherwise, the result was already correct + + tbl1 = impCloneExpr(tbl1, &tbl2, CHECK_SPILL_ALL, nullptr DEBUGARG("Cloning tbl for Math.Max/Min")); + + op1Clone = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1Clone, op2Clone, tbl1, gtNewIconNode(0), + NI_AVX10v1_FixupScalar, callJitType, 16); + + tmp = gtNewSimdHWIntrinsicNode(TYP_SIMD16, tmp, op1Clone, tbl2, gtNewIconNode(0), NI_AVX10v1_FixupScalar, + callJitType, 16); + } + + return gtNewSimdToScalarNode(callType, tmp, callJitType, 16); + } + else if (compOpportunisticallyDependsOn(InstructionSet_AVX512DQ)) { // We are constructing a chain of intrinsics similar to: // var op1 = Vector128.CreateScalarUnsafe(x); diff --git a/src/coreclr/jit/importervectorization.cpp b/src/coreclr/jit/importervectorization.cpp index dddc14dec3b9b2..13ac575bdfd2e9 100644 --- a/src/coreclr/jit/importervectorization.cpp +++ b/src/coreclr/jit/importervectorization.cpp @@ -161,7 +161,8 @@ GenTree* Compiler::impExpandHalfConstEqualsSIMD( GenTreeVecCon* toLowerVec2 = gtNewVconNode(simdType, (BYTE*)toLowerMask + byteLen - simdSize); #if defined(TARGET_XARCH) - if 
(compOpportunisticallyDependsOn(InstructionSet_AVX512F_VL)) + if (compOpportunisticallyDependsOn(InstructionSet_AVX512F_VL) || + compOpportunisticallyDependsOn(InstructionSet_AVX10v1)) { GenTree* control; @@ -185,7 +186,8 @@ GenTree* Compiler::impExpandHalfConstEqualsSIMD( // ((v1 ^ cns1) | (v2 ^ cns2)) == zero #if defined(TARGET_XARCH) - if (compOpportunisticallyDependsOn(InstructionSet_AVX512F_VL)) + if (compOpportunisticallyDependsOn(InstructionSet_AVX512F_VL) || + compOpportunisticallyDependsOn(InstructionSet_AVX10v1)) { GenTree* control; diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index f8fde5bafb3827..91eeb870998522 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -892,7 +892,105 @@ GenTree* Lowering::LowerCast(GenTree* tree) Vector128.Create(long.MaxValue) ); */ - if (comp->IsBaselineVector512IsaSupportedOpportunistically()) + if (comp->compOpportunisticallyDependsOn(InstructionSet_AVX10v1)) + { + // Clone the cast operand for usage. + GenTree* op1Clone1 = comp->gtClone(castOp); + BlockRange().InsertAfter(castOp, op1Clone1); + + // Generate the control table for VFIXUPIMMSD + // The behavior we want is to saturate negative values to 0. + GenTreeVecCon* tbl = comp->gtNewVconNode(TYP_SIMD16); + tbl->gtSimdVal.i32[0] = (varTypeIsUnsigned(dstType)) ? 0x08080088 : 0x00000088; + BlockRange().InsertAfter(op1Clone1, tbl); + + // get a zero int node for control table + GenTree* ctrlByte = comp->gtNewIconNode(0); + BlockRange().InsertAfter(tbl, ctrlByte); + + if (varTypeIsUnsigned(dstType)) + { + // run vfixupimmsd based on table and no flags reporting + GenTree* oper1 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, castOp, op1Clone1, tbl, ctrlByte, + NI_AVX10v1_FixupScalar, fieldType, 16); + BlockRange().InsertAfter(ctrlByte, oper1); + LowerNode(oper1); + + // Convert to scalar + // Here, we try to insert a Vector128 to Scalar node so that the input + // can be provided to the scalar cast + GenTree* oper2 = comp->gtNewSimdHWIntrinsicNode(srcType, oper1, NI_Vector128_ToScalar, fieldType, 16); + BlockRange().InsertAfter(oper1, oper2); + LowerNode(oper2); + + castOutput = comp->gtNewCastNode(genActualType(dstType), oper2, false, dstType); + BlockRange().InsertAfter(oper2, castOutput); + } + else + { + CorInfoType destFieldType = (dstType == TYP_INT) ? CORINFO_TYPE_INT : CORINFO_TYPE_LONG; + + ssize_t actualMaxVal = (dstType == TYP_INT) ? INT32_MAX : INT64_MAX; + + // run vfixupimmsd based on table and no flags reporting + GenTree* fixupVal = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, castOp, op1Clone1, tbl, ctrlByte, + NI_AVX10v1_FixupScalar, fieldType, 16); + BlockRange().InsertAfter(ctrlByte, fixupVal); + LowerNode(fixupVal); + + // get the max value vector + GenTree* maxValScalar = (srcType == TYP_DOUBLE) + ? comp->gtNewDconNodeD(static_cast<double>(actualMaxVal)) + : comp->gtNewDconNodeF(static_cast<float>(actualMaxVal)); + GenTree* maxVal = comp->gtNewSimdCreateBroadcastNode(TYP_SIMD16, maxValScalar, fieldType, 16); + BlockRange().InsertAfter(fixupVal, maxVal); + + GenTree* maxValDstTypeScalar = (dstType == TYP_INT) ?
comp->gtNewIconNode(actualMaxVal, dstType) + : comp->gtNewLconNode(actualMaxVal); + GenTree* maxValDstType = + comp->gtNewSimdCreateBroadcastNode(TYP_SIMD16, maxValDstTypeScalar, destFieldType, 16); + BlockRange().InsertAfter(maxVal, maxValDstType); + + // usage 1 --> compare with max value of integer + GenTree* compMask = comp->gtNewSimdCmpOpNode(GT_GE, TYP_SIMD16, fixupVal, maxVal, fieldType, 16); + BlockRange().InsertAfter(maxValDstType, compMask); + + // convert fixupVal to local variable and clone it for further use + LIR::Use fixupValUse(BlockRange(), &(compMask->AsHWIntrinsic()->Op(1)), compMask); + ReplaceWithLclVar(fixupValUse); + fixupVal = compMask->AsHWIntrinsic()->Op(1); + GenTree* fixupValClone = comp->gtClone(fixupVal); + LowerNode(compMask); + BlockRange().InsertAfter(fixupVal, fixupValClone); + + GenTree* FixupValCloneScalar = + comp->gtNewSimdHWIntrinsicNode(srcType, fixupValClone, NI_Vector128_ToScalar, fieldType, 16); + BlockRange().InsertAfter(compMask, FixupValCloneScalar); + LowerNode(FixupValCloneScalar); + + // cast it + GenTreeCast* newCast = comp->gtNewCastNode(dstType, FixupValCloneScalar, false, dstType); + BlockRange().InsertAfter(FixupValCloneScalar, newCast); + + GenTree* newTree = comp->gtNewSimdCreateBroadcastNode(TYP_SIMD16, newCast, destFieldType, 16); + BlockRange().InsertAfter(newCast, newTree); + LowerNode(newTree); + + // usage 2 --> use the compared mask with input value and max value to blend + GenTree* control = comp->gtNewIconNode(0xCA); // (B & A) | (C & ~A) + BlockRange().InsertAfter(newTree, control); + GenTree* cndSelect = comp->gtNewSimdTernaryLogicNode(TYP_SIMD16, compMask, maxValDstType, newTree, + control, destFieldType, 16); + BlockRange().InsertAfter(control, cndSelect); + LowerNode(cndSelect); + + castOutput = + comp->gtNewSimdHWIntrinsicNode(dstType, cndSelect, NI_Vector128_ToScalar, destFieldType, 16); + BlockRange().InsertAfter(cndSelect, castOutput); + LowerNode(castOutput); + } + } + else if (comp->IsBaselineVector512IsaSupportedOpportunistically()) { // Clone the cast operand for usage. GenTree* op1Clone1 = comp->gtClone(castOp); diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp index 4bbab4e558799b..967e8d3b6c36f0 100644 --- a/src/coreclr/jit/morph.cpp +++ b/src/coreclr/jit/morph.cpp @@ -382,6 +382,7 @@ GenTree* Compiler::fgMorphExpandCast(GenTreeCast* tree) // double -> int/uint/long for SSE41 // For all other conversions, we use helper functions.
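For reference, the saturating behavior that the signed LowerCast expansion above encodes (NaN forced to zero by the vfixupimm table, results clamped at the destination maximum by the compare-and-blend) can be modeled in C# as below. This is an illustrative sketch of the intended semantics, not the code the JIT emits; the class and method names are hypothetical, and the lower-bound path is not modeled because the expansion leaves it to the underlying scalar convert.

    internal static class SaturatingCastSketch
    {
        // Reference model of the saturating double -> int conversion built above;
        // it mirrors only the NaN and upper-clamp handling spelled out in the hunk.
        internal static int SaturatingToInt32(double value)
        {
            if (double.IsNaN(value))
            {
                return 0;            // fixup table 0x00000088 maps QNaN/SNaN to +0
            }

            if (value >= int.MaxValue)
            {
                return int.MaxValue; // GE compare + TernaryLogic(0xCA) blends in the max
            }

            return (int)value;       // everything else takes the ordinary scalar convert
        }
    }

Under this model, SaturatingToInt32(double.NaN) yields 0 and SaturatingToInt32(1e10) yields int.MaxValue, matching the saturation the control table and the compare-with-max blend are set up to produce.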
if (compOpportunisticallyDependsOn(InstructionSet_AVX512F) || + compOpportunisticallyDependsOn(InstructionSet_AVX10v1) || ((dstType != TYP_ULONG) && compOpportunisticallyDependsOn(InstructionSet_SSE41))) { if (tree->CastOp() != oper) diff --git a/src/coreclr/jit/simdashwintrinsic.cpp b/src/coreclr/jit/simdashwintrinsic.cpp index c9b227440d4e5d..88fd6a9ab79222 100644 --- a/src/coreclr/jit/simdashwintrinsic.cpp +++ b/src/coreclr/jit/simdashwintrinsic.cpp @@ -516,7 +516,8 @@ GenTree* Compiler::impSimdAsHWIntrinsicSpecial(NamedIntrinsic intrinsic, case NI_VectorT_ConvertToDouble: { - if (IsBaselineVector512IsaSupportedOpportunistically()) + if (IsBaselineVector512IsaSupportedOpportunistically() || + ((simdSize != 64) && compOpportunisticallyDependsOn(InstructionSet_AVX10v1))) { break; } @@ -539,7 +540,8 @@ GenTree* Compiler::impSimdAsHWIntrinsicSpecial(NamedIntrinsic intrinsic, case NI_VectorT_ConvertToUInt64: case NI_VectorT_ConvertToUInt64Native: { - if (IsBaselineVector512IsaSupportedOpportunistically()) + if (IsBaselineVector512IsaSupportedOpportunistically() || + (simdSize != 64 && compOpportunisticallyDependsOn(InstructionSet_AVX10v1))) { break; } @@ -549,7 +551,9 @@ GenTree* Compiler::impSimdAsHWIntrinsicSpecial(NamedIntrinsic intrinsic, case NI_VectorT_ConvertToSingle: { if ((simdBaseType == TYP_INT) || - (simdBaseType == TYP_UINT && IsBaselineVector512IsaSupportedOpportunistically())) + (simdBaseType == TYP_UINT && + (IsBaselineVector512IsaSupportedOpportunistically() || + (simdSize != 64 && compOpportunisticallyDependsOn(InstructionSet_AVX10v1))))) { break; } @@ -1183,18 +1187,33 @@ GenTree* Compiler::impSimdAsHWIntrinsicSpecial(NamedIntrinsic intrinsic, assert(sig->numArgs == 1); assert(varTypeIsLong(simdBaseType)); NamedIntrinsic intrinsic = NI_Illegal; - if (simdSize == 64) + if (compOpportunisticallyDependsOn(InstructionSet_AVX10v1)) { - intrinsic = NI_AVX512DQ_ConvertToVector512Double; - } - else if (simdSize == 32) - { - intrinsic = NI_AVX512DQ_VL_ConvertToVector256Double; + if (simdSize == 32) + { + intrinsic = NI_AVX10v1_ConvertToVector256Double; + } + else + { + assert(simdSize == 16); + intrinsic = NI_AVX10v1_ConvertToVector128Double; + } } else { - assert(simdSize == 16); - intrinsic = NI_AVX512DQ_VL_ConvertToVector128Double; + if (simdSize == 64) + { + intrinsic = NI_AVX512DQ_ConvertToVector512Double; + } + else if (simdSize == 32) + { + intrinsic = NI_AVX512DQ_VL_ConvertToVector256Double; + } + else + { + assert(simdSize == 16); + intrinsic = NI_AVX512DQ_VL_ConvertToVector128Double; + } } return gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, simdBaseJitType, simdSize); } @@ -1220,6 +1239,21 @@ GenTree* Compiler::impSimdAsHWIntrinsicSpecial(NamedIntrinsic intrinsic, unreached(); } } + else if (simdBaseType == TYP_UINT && simdSize != 64 && + compOpportunisticallyDependsOn(InstructionSet_AVX10v1)) + { + switch (simdSize) + { + case 16: + intrinsic = NI_AVX10v1_ConvertToVector128Single; + break; + case 32: + intrinsic = NI_AVX10v1_ConvertToVector256Single; + break; + default: + unreached(); + } + } else if (simdBaseType == TYP_UINT) { switch (simdSize) diff --git a/src/tests/Common/GenerateHWIntrinsicTests/GenerateHWIntrinsicTests_X86.cs b/src/tests/Common/GenerateHWIntrinsicTests/GenerateHWIntrinsicTests_X86.cs index 406a3a93b6bd4d..3e5506bbb88690 100644 --- a/src/tests/Common/GenerateHWIntrinsicTests/GenerateHWIntrinsicTests_X86.cs +++ b/src/tests/Common/GenerateHWIntrinsicTests/GenerateHWIntrinsicTests_X86.cs @@ -3871,22 +3871,22 @@ (string templateFileName, Dictionary 
templateData)[] Avx10v1_V512_Avx512VbmiInputs = new [] { - ("SimpleBinOpTest.template", new Dictionary { ["Isa"] = "Avx512Vbmi", ["LoadIsa"] = "Avx512F", ["Method"] = "PermuteVar64x8", ["RetVectorType"] = "Vector512", ["RetBaseType"] = "Byte", ["Op1VectorType"] = "Vector512", ["Op1BaseType"] = "Byte", ["Op2VectorType"] = "Vector512", ["Op2BaseType"] = "Byte", ["LargestVectorSize"] = "64", ["NextValueOp1"] = "TestLibrary.Generator.GetByte()", ["NextValueOp2"] = "TestLibrary.Generator.GetByte()", ["ValidateFirstResult"] = "result[0] != left[(right[0] & 63)]", ["ValidateRemainingResults"] = "result[i] != left[(right[i] & 63)]"}), - ("SimpleBinOpTest.template", new Dictionary { ["Isa"] = "Avx512Vbmi", ["LoadIsa"] = "Avx512F", ["Method"] = "PermuteVar64x8", ["RetVectorType"] = "Vector512", ["RetBaseType"] = "SByte", ["Op1VectorType"] = "Vector512", ["Op1BaseType"] = "SByte", ["Op2VectorType"] = "Vector512", ["Op2BaseType"] = "SByte", ["LargestVectorSize"] = "64", ["NextValueOp1"] = "TestLibrary.Generator.GetSByte()", ["NextValueOp2"] = "TestLibrary.Generator.GetSByte()", ["ValidateFirstResult"] = "result[0] != left[(right[0] & 63)]", ["ValidateRemainingResults"] = "result[i] != left[(right[i] & 63)]"}), - ("SimpleTernOpTest.template", new Dictionary { ["Isa"] = "Avx512Vbmi", ["LoadIsa"] = "Avx512F", ["Method"] = "PermuteVar64x8x2", ["RetVectorType"] = "Vector512", ["RetBaseType"] = "Byte", ["Op1VectorType"] = "Vector512", ["Op1BaseType"] = "Byte", ["Op2VectorType"] = "Vector512", ["Op2BaseType"] = "Byte", ["Op3VectorType"] = "Vector512", ["Op3BaseType"] = "Byte", ["LargestVectorSize"] = "64", ["NextValueOp1"] = "TestLibrary.Generator.GetByte()", ["NextValueOp2"] = "TestLibrary.Generator.GetByte()", ["NextValueOp3"] = "TestLibrary.Generator.GetByte()", ["ValidateFirstResult"] = "result[0] != (((secondOp[0] & 127) <= 63) ? firstOp[secondOp[0] & 63] : thirdOp[secondOp[0] & 63])", ["ValidateRemainingResults"] = "result[i] != (((secondOp[i] & 127) <= 63) ? firstOp[secondOp[i] & 63] : thirdOp[secondOp[i] & 63])"}), - ("SimpleTernOpTest.template", new Dictionary { ["Isa"] = "Avx512Vbmi", ["LoadIsa"] = "Avx512F", ["Method"] = "PermuteVar64x8x2", ["RetVectorType"] = "Vector512", ["RetBaseType"] = "SByte", ["Op1VectorType"] = "Vector512", ["Op1BaseType"] = "SByte", ["Op2VectorType"] = "Vector512", ["Op2BaseType"] = "SByte", ["Op3VectorType"] = "Vector512", ["Op3BaseType"] = "SByte", ["LargestVectorSize"] = "64", ["NextValueOp1"] = "TestLibrary.Generator.GetSByte()", ["NextValueOp2"] = "TestLibrary.Generator.GetSByte()", ["NextValueOp3"] = "TestLibrary.Generator.GetSByte()", ["ValidateFirstResult"] = "result[0] != (((secondOp[0] & 127) <= 63) ? firstOp[secondOp[0] & 63] : thirdOp[secondOp[0] & 63])", ["ValidateRemainingResults"] = "result[i] != (((secondOp[i] & 127) <= 63) ? 
firstOp[secondOp[i] & 63] : thirdOp[secondOp[i] & 63])"}), + ("SimpleBinOpTest.template", new Dictionary { ["Isa"] = "Avx10v1.V512", ["LoadIsa"] = "Avx10v1.V512", ["Method"] = "PermuteVar64x8", ["RetVectorType"] = "Vector512", ["RetBaseType"] = "Byte", ["Op1VectorType"] = "Vector512", ["Op1BaseType"] = "Byte", ["Op2VectorType"] = "Vector512", ["Op2BaseType"] = "Byte", ["LargestVectorSize"] = "64", ["NextValueOp1"] = "TestLibrary.Generator.GetByte()", ["NextValueOp2"] = "TestLibrary.Generator.GetByte()", ["ValidateFirstResult"] = "result[0] != left[(right[0] & 63)]", ["ValidateRemainingResults"] = "result[i] != left[(right[i] & 63)]"}), + ("SimpleBinOpTest.template", new Dictionary { ["Isa"] = "Avx10v1.V512", ["LoadIsa"] = "Avx10v1.V512", ["Method"] = "PermuteVar64x8", ["RetVectorType"] = "Vector512", ["RetBaseType"] = "SByte", ["Op1VectorType"] = "Vector512", ["Op1BaseType"] = "SByte", ["Op2VectorType"] = "Vector512", ["Op2BaseType"] = "SByte", ["LargestVectorSize"] = "64", ["NextValueOp1"] = "TestLibrary.Generator.GetSByte()", ["NextValueOp2"] = "TestLibrary.Generator.GetSByte()", ["ValidateFirstResult"] = "result[0] != left[(right[0] & 63)]", ["ValidateRemainingResults"] = "result[i] != left[(right[i] & 63)]"}), + ("SimpleTernOpTest.template", new Dictionary { ["Isa"] = "Avx10v1.V512", ["LoadIsa"] = "Avx10v1.V512", ["Method"] = "PermuteVar64x8x2", ["RetVectorType"] = "Vector512", ["RetBaseType"] = "Byte", ["Op1VectorType"] = "Vector512", ["Op1BaseType"] = "Byte", ["Op2VectorType"] = "Vector512", ["Op2BaseType"] = "Byte", ["Op3VectorType"] = "Vector512", ["Op3BaseType"] = "Byte", ["LargestVectorSize"] = "64", ["NextValueOp1"] = "TestLibrary.Generator.GetByte()", ["NextValueOp2"] = "TestLibrary.Generator.GetByte()", ["NextValueOp3"] = "TestLibrary.Generator.GetByte()", ["ValidateFirstResult"] = "result[0] != (((secondOp[0] & 127) <= 63) ? firstOp[secondOp[0] & 63] : thirdOp[secondOp[0] & 63])", ["ValidateRemainingResults"] = "result[i] != (((secondOp[i] & 127) <= 63) ? firstOp[secondOp[i] & 63] : thirdOp[secondOp[i] & 63])"}), + ("SimpleTernOpTest.template", new Dictionary { ["Isa"] = "Avx10v1.V512", ["LoadIsa"] = "Avx10v1.V512", ["Method"] = "PermuteVar64x8x2", ["RetVectorType"] = "Vector512", ["RetBaseType"] = "SByte", ["Op1VectorType"] = "Vector512", ["Op1BaseType"] = "SByte", ["Op2VectorType"] = "Vector512", ["Op2BaseType"] = "SByte", ["Op3VectorType"] = "Vector512", ["Op3BaseType"] = "SByte", ["LargestVectorSize"] = "64", ["NextValueOp1"] = "TestLibrary.Generator.GetSByte()", ["NextValueOp2"] = "TestLibrary.Generator.GetSByte()", ["NextValueOp3"] = "TestLibrary.Generator.GetSByte()", ["ValidateFirstResult"] = "result[0] != (((secondOp[0] & 127) <= 63) ? firstOp[secondOp[0] & 63] : thirdOp[secondOp[0] & 63])", ["ValidateRemainingResults"] = "result[i] != (((secondOp[i] & 127) <= 63) ? 
firstOp[secondOp[i] & 63] : thirdOp[secondOp[i] & 63])"}), }; (string templateFileName, Dictionary templateData)[] Avx10v1_V512_Avx512CDInputs = new [] { - ("SimpleUnOpTest.template", new Dictionary { ["Isa"] = "Avx512CD", ["LoadIsa"] = "Avx512F", ["Method"] = "DetectConflicts", ["RetVectorType"] = "Vector512", ["RetBaseType"] = "Int32", ["Op1VectorType"] = "Vector512", ["Op1BaseType"] = "Int32", ["LargestVectorSize"] = "64", ["NextValueOp1"] = "TestLibrary.Generator.GetInt32()", ["ValidateFirstResult"] = "result[0] != Avx512Verify.DetectConflicts(firstOp, 0)", ["ValidateRemainingResults"] = "result[i] != Avx512Verify.DetectConflicts(firstOp, i)"}), - ("SimpleUnOpTest.template", new Dictionary { ["Isa"] = "Avx512CD", ["LoadIsa"] = "Avx512F", ["Method"] = "DetectConflicts", ["RetVectorType"] = "Vector512", ["RetBaseType"] = "Int64", ["Op1VectorType"] = "Vector512", ["Op1BaseType"] = "Int64", ["LargestVectorSize"] = "64", ["NextValueOp1"] = "TestLibrary.Generator.GetInt64()", ["ValidateFirstResult"] = "result[0] != Avx512Verify.DetectConflicts(firstOp, 0)", ["ValidateRemainingResults"] = "result[i] != Avx512Verify.DetectConflicts(firstOp, i)"}), - ("SimpleUnOpTest.template", new Dictionary { ["Isa"] = "Avx512CD", ["LoadIsa"] = "Avx512F", ["Method"] = "DetectConflicts", ["RetVectorType"] = "Vector512", ["RetBaseType"] = "UInt32", ["Op1VectorType"] = "Vector512", ["Op1BaseType"] = "UInt32", ["LargestVectorSize"] = "64", ["NextValueOp1"] = "TestLibrary.Generator.GetUInt32()", ["ValidateFirstResult"] = "result[0] != Avx512Verify.DetectConflicts(firstOp, 0)", ["ValidateRemainingResults"] = "result[i] != Avx512Verify.DetectConflicts(firstOp, i)"}), - ("SimpleUnOpTest.template", new Dictionary { ["Isa"] = "Avx512CD", ["LoadIsa"] = "Avx512F", ["Method"] = "DetectConflicts", ["RetVectorType"] = "Vector512", ["RetBaseType"] = "UInt64", ["Op1VectorType"] = "Vector512", ["Op1BaseType"] = "UInt64", ["LargestVectorSize"] = "64", ["NextValueOp1"] = "TestLibrary.Generator.GetUInt64()", ["ValidateFirstResult"] = "result[0] != Avx512Verify.DetectConflicts(firstOp, 0)", ["ValidateRemainingResults"] = "result[i] != Avx512Verify.DetectConflicts(firstOp, i)"}), - ("SimpleUnOpTest.template", new Dictionary { ["Isa"] = "Avx512CD", ["LoadIsa"] = "Avx512F", ["Method"] = "LeadingZeroCount", ["RetVectorType"] = "Vector512", ["RetBaseType"] = "Int32", ["Op1VectorType"] = "Vector512", ["Op1BaseType"] = "Int32", ["LargestVectorSize"] = "64", ["NextValueOp1"] = "TestLibrary.Generator.GetInt32()", ["ValidateFirstResult"] = "result[0] != int.LeadingZeroCount(firstOp[0])", ["ValidateRemainingResults"] = "result[i] != int.LeadingZeroCount(firstOp[i])"}), - ("SimpleUnOpTest.template", new Dictionary { ["Isa"] = "Avx512CD", ["LoadIsa"] = "Avx512F", ["Method"] = "LeadingZeroCount", ["RetVectorType"] = "Vector512", ["RetBaseType"] = "Int64", ["Op1VectorType"] = "Vector512", ["Op1BaseType"] = "Int64", ["LargestVectorSize"] = "64", ["NextValueOp1"] = "TestLibrary.Generator.GetInt64()", ["ValidateFirstResult"] = "result[0] != long.LeadingZeroCount(firstOp[0])", ["ValidateRemainingResults"] = "result[i] != long.LeadingZeroCount(firstOp[i])"}), - ("SimpleUnOpTest.template", new Dictionary { ["Isa"] = "Avx512CD", ["LoadIsa"] = "Avx512F", ["Method"] = "LeadingZeroCount", ["RetVectorType"] = "Vector512", ["RetBaseType"] = "UInt32", ["Op1VectorType"] = "Vector512", ["Op1BaseType"] = "UInt32", ["LargestVectorSize"] = "64", ["NextValueOp1"] = "TestLibrary.Generator.GetUInt32()", ["ValidateFirstResult"] = "result[0] != 
uint.LeadingZeroCount(firstOp[0])", ["ValidateRemainingResults"] = "result[i] != uint.LeadingZeroCount(firstOp[i])"}), - ("SimpleUnOpTest.template", new Dictionary { ["Isa"] = "Avx512CD", ["LoadIsa"] = "Avx512F", ["Method"] = "LeadingZeroCount", ["RetVectorType"] = "Vector512", ["RetBaseType"] = "UInt64", ["Op1VectorType"] = "Vector512", ["Op1BaseType"] = "UInt64", ["LargestVectorSize"] = "64", ["NextValueOp1"] = "TestLibrary.Generator.GetUInt64()", ["ValidateFirstResult"] = "result[0] != ulong.LeadingZeroCount(firstOp[0])", ["ValidateRemainingResults"] = "result[i] != ulong.LeadingZeroCount(firstOp[i])"}), + ("SimpleUnOpTest.template", new Dictionary { ["Isa"] = "Avx10v1.V512", ["LoadIsa"] = "Avx10v1.V512", ["Method"] = "DetectConflicts", ["RetVectorType"] = "Vector512", ["RetBaseType"] = "Int32", ["Op1VectorType"] = "Vector512", ["Op1BaseType"] = "Int32", ["LargestVectorSize"] = "64", ["NextValueOp1"] = "TestLibrary.Generator.GetInt32()", ["ValidateFirstResult"] = "result[0] != Avx512Verify.DetectConflicts(firstOp, 0)", ["ValidateRemainingResults"] = "result[i] != Avx512Verify.DetectConflicts(firstOp, i)"}), + ("SimpleUnOpTest.template", new Dictionary { ["Isa"] = "Avx10v1.V512", ["LoadIsa"] = "Avx10v1.V512", ["Method"] = "DetectConflicts", ["RetVectorType"] = "Vector512", ["RetBaseType"] = "Int64", ["Op1VectorType"] = "Vector512", ["Op1BaseType"] = "Int64", ["LargestVectorSize"] = "64", ["NextValueOp1"] = "TestLibrary.Generator.GetInt64()", ["ValidateFirstResult"] = "result[0] != Avx512Verify.DetectConflicts(firstOp, 0)", ["ValidateRemainingResults"] = "result[i] != Avx512Verify.DetectConflicts(firstOp, i)"}), + ("SimpleUnOpTest.template", new Dictionary { ["Isa"] = "Avx10v1.V512", ["LoadIsa"] = "Avx10v1.V512", ["Method"] = "DetectConflicts", ["RetVectorType"] = "Vector512", ["RetBaseType"] = "UInt32", ["Op1VectorType"] = "Vector512", ["Op1BaseType"] = "UInt32", ["LargestVectorSize"] = "64", ["NextValueOp1"] = "TestLibrary.Generator.GetUInt32()", ["ValidateFirstResult"] = "result[0] != Avx512Verify.DetectConflicts(firstOp, 0)", ["ValidateRemainingResults"] = "result[i] != Avx512Verify.DetectConflicts(firstOp, i)"}), + ("SimpleUnOpTest.template", new Dictionary { ["Isa"] = "Avx10v1.V512", ["LoadIsa"] = "Avx10v1.V512", ["Method"] = "DetectConflicts", ["RetVectorType"] = "Vector512", ["RetBaseType"] = "UInt64", ["Op1VectorType"] = "Vector512", ["Op1BaseType"] = "UInt64", ["LargestVectorSize"] = "64", ["NextValueOp1"] = "TestLibrary.Generator.GetUInt64()", ["ValidateFirstResult"] = "result[0] != Avx512Verify.DetectConflicts(firstOp, 0)", ["ValidateRemainingResults"] = "result[i] != Avx512Verify.DetectConflicts(firstOp, i)"}), + ("SimpleUnOpTest.template", new Dictionary { ["Isa"] = "Avx10v1.V512", ["LoadIsa"] = "Avx10v1.V512", ["Method"] = "LeadingZeroCount", ["RetVectorType"] = "Vector512", ["RetBaseType"] = "Int32", ["Op1VectorType"] = "Vector512", ["Op1BaseType"] = "Int32", ["LargestVectorSize"] = "64", ["NextValueOp1"] = "TestLibrary.Generator.GetInt32()", ["ValidateFirstResult"] = "result[0] != int.LeadingZeroCount(firstOp[0])", ["ValidateRemainingResults"] = "result[i] != int.LeadingZeroCount(firstOp[i])"}), + ("SimpleUnOpTest.template", new Dictionary { ["Isa"] = "Avx10v1.V512", ["LoadIsa"] = "Avx10v1.V512", ["Method"] = "LeadingZeroCount", ["RetVectorType"] = "Vector512", ["RetBaseType"] = "Int64", ["Op1VectorType"] = "Vector512", ["Op1BaseType"] = "Int64", ["LargestVectorSize"] = "64", ["NextValueOp1"] = "TestLibrary.Generator.GetInt64()", ["ValidateFirstResult"] = "result[0] != 
long.LeadingZeroCount(firstOp[0])", ["ValidateRemainingResults"] = "result[i] != long.LeadingZeroCount(firstOp[i])"}), + ("SimpleUnOpTest.template", new Dictionary { ["Isa"] = "Avx10v1.V512", ["LoadIsa"] = "Avx10v1.V512", ["Method"] = "LeadingZeroCount", ["RetVectorType"] = "Vector512", ["RetBaseType"] = "UInt32", ["Op1VectorType"] = "Vector512", ["Op1BaseType"] = "UInt32", ["LargestVectorSize"] = "64", ["NextValueOp1"] = "TestLibrary.Generator.GetUInt32()", ["ValidateFirstResult"] = "result[0] != uint.LeadingZeroCount(firstOp[0])", ["ValidateRemainingResults"] = "result[i] != uint.LeadingZeroCount(firstOp[i])"}), + ("SimpleUnOpTest.template", new Dictionary { ["Isa"] = "Avx10v1.V512", ["LoadIsa"] = "Avx10v1.V512", ["Method"] = "LeadingZeroCount", ["RetVectorType"] = "Vector512", ["RetBaseType"] = "UInt64", ["Op1VectorType"] = "Vector512", ["Op1BaseType"] = "UInt64", ["LargestVectorSize"] = "64", ["NextValueOp1"] = "TestLibrary.Generator.GetUInt64()", ["ValidateFirstResult"] = "result[0] != ulong.LeadingZeroCount(firstOp[0])", ["ValidateRemainingResults"] = "result[i] != ulong.LeadingZeroCount(firstOp[i])"}), }; (string templateFileName, Dictionary templateData)[] Fma_Vector128Inputs = new []