diff --git a/CMakeLists.txt b/CMakeLists.txt index 72dad78a..3ded27c3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,6 @@ cmake_minimum_required(VERSION 3.5) -project(xbyak LANGUAGES CXX VERSION 7.10) +project(xbyak LANGUAGES CXX VERSION 7.20) file(GLOB headers xbyak/*.h) diff --git a/doc/changelog.md b/doc/changelog.md index 5e25c2dd..1461f6ed 100644 --- a/doc/changelog.md +++ b/doc/changelog.md @@ -1,5 +1,7 @@ # History +* 2024/Oct/15 ver 7.20 Fixed the specification of setDefaultEncoding, setDefaultEncodingAVX10. +* 2024/Oct/15 ver 7.11 Added full support for AVX10.2 * 2024/Oct/13 ver 7.10 support AVX10 integer and fp16 vnni, media new instructions. setDefaultEncoding is extended. * 2024/Oct/10 ver 7.09.1 fix the names of vpcompressb and vpcompressw * 2024/Oct/08 ver 7.09 support YMM embedded rounding of AVX10.2 and fix some mnemonics with {sae}/{er}. diff --git a/doc/usage.md b/doc/usage.md index 5b255130..dcb3e101 100644 --- a/doc/usage.md +++ b/doc/usage.md @@ -1,7 +1,7 @@ # Usage Inherit `Xbyak::CodeGenerator` class and make the class method. -``` +```cpp #include struct Code : Xbyak::CodeGenerator { @@ -13,7 +13,7 @@ struct Code : Xbyak::CodeGenerator { }; ``` Or you can pass the instance of CodeGenerator without inheriting. -``` +```cpp void genCode(Xbyak::CodeGenerator& code, int x) { using namespace Xbyak::util; code.mov(eax, x); @@ -23,7 +23,7 @@ void genCode(Xbyak::CodeGenerator& code, int x) { Make an instance of the class and get the function pointer by calling `getCode()` and call it. -``` +```cpp Code c(5); int (*f)() = c.getCode(); printf("ret=%d\n", f()); // ret = 5 @@ -32,7 +32,7 @@ printf("ret=%d\n", f()); // ret = 5 ## Syntax Similar to MASM/NASM syntax with parentheses. -``` +```cpp NASM Xbyak mov eax, ebx --> mov(eax, ebx); inc ecx inc(ecx); @@ -43,7 +43,7 @@ ret --> ret(); Use `qword`, `dword`, `word` and `byte` if it is necessary to specify the size of memory, otherwise use `ptr`.
-``` +```cpp (ptr|qword|dword|word|byte) [base + index * (1|2|4|8) + displacement] [rip + 32bit disp] ; x64 only @@ -53,19 +53,21 @@ mov al, [ebx+ecx] --> mov(al, ptr [ebx + ecx]); test byte [esp], 4 --> test(byte [esp], 4); inc qword [rax] --> inc(qword [rax]); ``` + **Note**: `qword`, ... are member variables, then don't use `dword` as unsigned int type. ### How to use Selector (Segment Register) -``` +```cpp mov eax, [fs:eax] --> putSeg(fs); mov(eax, ptr [eax]); mov ax, cs --> mov(ax, cs); ``` + **Note**: Segment class is not derived from `Operand`. ## AVX -``` +```cpp vaddps(xmm1, xmm2, xmm3); // xmm1 <- xmm2 + xmm3 vaddps(xmm2, xmm3, ptr [rax]); // use ptr to access memory vgatherdpd(xmm1, ptr [ebp + 256 + xmm2*4], xmm3); @@ -74,13 +76,13 @@ vgatherdpd(xmm1, ptr [ebp + 256 + xmm2*4], xmm3); **Note**: If `XBYAK_ENABLE_OMITTED_OPERAND` is defined, then you can use two operand version for backward compatibility. But the newer version will not support it. -``` +```cpp vaddps(xmm2, xmm3); // xmm2 <- xmm2 + xmm3 ``` ## AVX-512 -``` +```cpp vaddpd zmm2, zmm5, zmm30 --> vaddpd(zmm2, zmm5, zmm30); vaddpd xmm30, xmm20, [rax] --> vaddpd(xmm30, xmm20, ptr [rax]); vaddps xmm30, xmm20, [rax] --> vaddps(xmm30, xmm20, ptr [rax]); @@ -108,35 +110,44 @@ vfpclasspd k5{k3}, [rax+64]{1to2}, 5 --> vfpclasspd(k5|k3, xword_b [rax+64], vfpclassps k5{k3}, [rax+64]{1to4}, 5 --> vfpclassps(k5|k3, yword_b [rax+64], 5); // broadcast 64-bit to 256-bit ``` -## Selecting AVX512-VNNI, AVX-VNNI, AVX-VNNI-INT8 etc. -Some mnemonics have two types of encodings: VEX and EVEX. +## Selecting AVX512-VNNI, AVX-VNNI, AVX-VNNI-INT8, AVX10.2. +Some mnemonics have several types of encodings: VEX, EVEX, and AVX10.2. The functions for these mnemonics include an optional parameter as the last argument to specify the encoding.
-The default behavior depends on the order in which the instruction was introduced (whether VEX or EVEX came first), +The default behavior depends on the order in which the instruction was introduced (whether VEX, EVEX or AVX10.2 came first), and can be specified using setDefaultEncoding. -``` +```cpp vpdpbusd(xm0, xm1, xm2); // default encoding: EVEX (AVX512-VNNI) vpdpbusd(xm0, xm1, xm2, EvexEncoding); // same as the above vpdpbusd(xm0, xm1, xm2, VexEncoding); // VEX (AVX-VNNI) -setDefaultEncoding(VexEncoding); // default encoding is VEX +setDefaultEncoding(VexEncoding); // change default encoding vpdpbusd(xm0, xm1, xm2); // VEX -vmpsadbw(xm1, xm3, xm15, 3); // default encoding: VEX (AVX-VNNI) -vmpsadbw(xm1, xm3, xm15, 3, VexEncoding); // same as the above -vmpsadbw(xm1, xm3, xm15, 3, EvexEncoding); // EVEX (AVX10.2) -setDefaultEncoding(VexEncoding, EvexEncoding); // use 2nd argument. -vmpsadbw(xm1, xm3, xm15, 3); // EVEX +vmpsadbw(xm1, xm3, xm15, 3); // default encoding: AVX +vmpsadbw(xm1, xm3, xm15, 3, PreAVX10v2Encoding); // same as the above +vmpsadbw(xm1, xm3, xm15, 3, AVX10v2Encoding); // AVX10.2 +setDefaultEncodingAVX10(AVX10v2Encoding); // change default encoding +vmpsadbw(xm1, xm3, xm15, 3); // AVX10.2 ``` -- `setDefaultEncoding(PreferredEncoding vnniEnc = EvexEncoding, PreferredEncoding avx10Enc = VexEncoding)` -Control the default encoding of mnemonics with `Xbyak::PreferredEncoding` param. +- `setDefaultEncoding(PreferredEncoding enc = EvexEncoding)` + - Configure encoding for AVX512-VNNI or AVX-VNNI instructions. +- `setDefaultEncodingAVX10(PreferredEncoding enc = PreAVX10v2Encoding)` + - Configure encoding for pre-AVX10.2 and AVX10.2 instructions.
+ +`setDefaultEncoding`|EvexEncoding (default)|VexEncoding +-|-|- +feature|AVX512-VNNI|AVX-VNNI + +- Target functions: vpdpbusd, vpdpbusds, vpdpwssd, vpdpwssds -param|vnniEnc|avx10Enc +`setDefaultEncodingAVX10`|PreAVX10v2Encoding (default)|AVX10v2Encoding -|-|- -EvexEncoding|AVX512-VNNI|AVX10.2 -VexEncoding|AVX-VNNI|AVX-VNNI-INT8 -default|EvexEncoding|VexEncoding -mnemonic|vpdpbusd, vpdpbusds, vpdpwssd, vpdpwssds|vmpsadbw, vpdpbssd, vpdpbssds, vpdpbsud, vpdpbsuds, vpdpbuud, vpdpbuuds, vpdpwsud vpdpwsuds vpdpwusd vpdpwusds vpdpwuud, vpdpwuuds +feature|AVX-VNNI-INT8, AVX512-FP16|AVX10.2 + +- Target functions: vmpsadbw, vpdpbssd, vpdpbssds, vpdpbsud, vpdpbsuds, vpdpbuud, vpdpbuuds, vpdpwsud, vpdpwsuds, vpdpwusd, vpdpwusds, vpdpwuud, vpdpwuuds, vmovd, vmovw + +- Remark: vmovd and vmovw have several kinds of encoding such as AVX/AVX512F/AVX512-FP16/AVX10.2. ### Remark * `k1`, ..., `k7` are opmask registers. @@ -179,7 +190,7 @@ mnemonic|vpdpbusd, vpdpbusds, vpdpwssd, vpdpwssds|vmpsadbw, vpdpbssd, vpdpbssds, Two kinds of Label are supported. (String literal and Label class). ### String literal -``` +```cpp L("L1"); jmp("L1"); @@ -201,7 +212,7 @@ L("L3"); ### Support `@@`, `@f`, `@b` like MASM -``` +```cpp L("@@"); // jmp("@b"); // jmp to jmp("@f"); // jmp to @@ -217,7 +228,7 @@ Label symbols beginning with a period between `inLocalLabel()` and `outLocalLabe are treated as a local label. `inLocalLabel()` and `outLocalLabel()` can be nested. -``` +```cpp void func1() { inLocalLabel(); @@ -240,7 +251,7 @@ void func1() Xbyak deals with jump mnemonics of an undefined label as short jump if no type is specified. So if the size between jmp and label is larger than 127 byte, then xbyak will cause an error. -``` +```cpp jmp("short-jmp"); // short jmp // small code L("short-jmp"); @@ -249,14 +260,16 @@ jmp("long-jmp"); // long code L("long-jmp"); // throw exception ``` + Then specify T_NEAR for jmp.
-``` +```cpp jmp("long-jmp", T_NEAR); // long jmp // long code L("long-jmp"); ``` + Or call `setDefaultJmpNEAR(true);` once, then the default type is set to T_NEAR. -``` +```cpp jmp("long-jmp"); // long jmp // long code L("long-jmp"); @@ -266,7 +279,7 @@ L("long-jmp"); `L()` and `jxx()` support Label class. -``` +```cpp Xbyak::Label label1, label2; L(label1); ... @@ -278,7 +291,7 @@ L(label2); ``` Use `putL` for jmp table -``` +```cpp Label labelTbl, L0, L1, L2; mov(rax, labelTbl); // rdx is an index of jump table @@ -295,7 +308,7 @@ L(L1); `assignL(dstLabel, srcLabel)` binds dstLabel with srcLabel. -``` +```cpp Label label2; Label label1 = L(); // make label1 ; same to Label label1; L(label1); ... @@ -310,7 +323,7 @@ The `jmp` in the above code jumps to label1 assigned by `assignL`. * dstLabel must not be used in `L()`. `Label::getAddress()` returns the address specified by the label instance and 0 if not specified. -``` +```cpp // not AutoGrow mode Label label; assert(label.getAddress() == 0); @@ -319,7 +332,7 @@ assert(label.getAddress() == getCurr()); ``` ### Rip ; relative addressing -``` +```cpp Label label; mov(eax, ptr [rip + label]); // eax = 4 ... @@ -327,7 +340,7 @@ mov(eax, ptr [rip + label]); // eax = 4 L(label); dd(4); ``` -``` +```cpp int x; ... mov(eax, ptr[rip + &x]); // throw exception if the difference between &x and current position is larger than 2GiB @@ -338,13 +351,13 @@ int x; Use `word|dword|qword` instead of `ptr` to specify the address size. ### 32 bit mode -``` +```cpp jmp(word[eax], T_FAR); // jmp m16:16(FF /5) jmp(dword[eax], T_FAR); // jmp m16:32(FF /5) ``` ### 64 bit mode -``` +```cpp jmp(word[rax], T_FAR); // jmp m16:16(FF /5) jmp(dword[rax], T_FAR); // jmp m16:32(FF /5) jmp(qword[rax], T_FAR); // jmp m16:64(REX.W FF /5) @@ -355,7 +368,7 @@ The same applies to `call`. The default max code size is 4096 bytes. Specify the size in constructor of `CodeGenerator()` if necessary. 
-``` +```cpp class Quantize : public Xbyak::CodeGenerator { public: Quantize() @@ -372,7 +385,7 @@ You can make jit code on prepared memory. Call `setProtectModeRE` yourself to change memory mode if using the prepared memory. -``` +```cpp uint8_t alignas(4096) buf[8192]; // C++11 or later struct Code : Xbyak::CodeGenerator { @@ -398,7 +411,7 @@ int main() The memory region for jit is automatically extended if necessary when `AutoGrow` is specified in a constructor of `CodeGenerator`. Call `ready()` or `readyRE()` before calling `getCode()` to fix jump address. -``` +```cpp struct Code : Xbyak::CodeGenerator { Code() : Xbyak::CodeGenerator(, Xbyak::AutoGrow) @@ -419,7 +432,7 @@ Xbyak set Read/Write/Exec mode to memory to run jit code. If you want to use Read/Exec mode for security, then specify `DontSetProtectRWE` for `CodeGenerator` and call `setProtectModeRE()` after generating jit code. -``` +```cpp struct Code : Xbyak::CodeGenerator { Code() : Xbyak::CodeGenerator(4096, Xbyak::DontSetProtectRWE) diff --git a/gen/gen_avx512.cpp b/gen/gen_avx512.cpp index 2b8a3286..e4d319ee 100644 --- a/gen/gen_avx512.cpp +++ b/gen/gen_avx512.cpp @@ -209,6 +209,30 @@ void putX_XM() { 0x2E, "vucomxsd", T_MUST_EVEX | T_F3 | T_0F | T_EW1 | T_SAE_X | T_N8 }, { 0x2E, "vucomxsh", T_MUST_EVEX | T_F2 | T_MAP5 | T_EW0 | T_SAE_X | T_N2 }, { 0x2E, "vucomxss", T_MUST_EVEX | T_F2 | T_0F | T_EW0 | T_SAE_X | T_N4 }, + + // 13.1 + { 0x69, "vcvtnebf162ibs", T_MUST_EVEX | T_YMM | T_F2 | T_MAP5 | T_EW0 | T_B16 }, + { 0x6B, "vcvtnebf162iubs", T_MUST_EVEX | T_YMM | T_F2 | T_MAP5 | T_EW0 | T_B16 }, + { 0x68, "vcvttnebf162ibs", T_MUST_EVEX | T_YMM | T_F2 | T_MAP5 | T_EW0 | T_B16 }, + { 0x6A, "vcvttnebf162iubs", T_MUST_EVEX | T_YMM | T_F2 | T_MAP5 | T_EW0 | T_B16 }, + // 13.3 + { 0x6D, "vcvttpd2qqs", T_MUST_EVEX | T_YMM | T_66 | T_MAP5 | T_EW1 | T_B64 | T_SAE_Y | T_SAE_Z }, + // 13.5 + { 0x6C, "vcvttpd2uqqs", T_MUST_EVEX | T_YMM | T_66 | T_MAP5 | T_EW1 | T_B64 | T_SAE_Y | T_SAE_Z }, + // 13.6 + { 0x69, 
"vcvtph2ibs", T_MUST_EVEX | T_YMM | T_MAP5 | T_EW0 | T_B16 | T_ER_Y | T_ER_Z }, + { 0x6B, "vcvtph2iubs", T_MUST_EVEX | T_YMM | T_MAP5 | T_EW0 | T_B16 | T_ER_Y | T_ER_Z }, + { 0x68, "vcvttph2ibs", T_MUST_EVEX | T_YMM | T_MAP5 | T_EW0 | T_B16 | T_ER_Y | T_ER_Z }, + { 0x6A, "vcvttph2iubs", T_MUST_EVEX | T_YMM | T_MAP5 | T_EW0 | T_B16 | T_ER_Y | T_ER_Z }, + // 13.7 + { 0x6D, "vcvttps2dqs", T_MUST_EVEX | T_YMM | T_MAP5 | T_EW0 | T_B32 | T_SAE_Y | T_SAE_Z }, + // 13.8 + { 0x69, "vcvtps2ibs", T_MUST_EVEX | T_YMM | T_66 | T_MAP5 | T_EW0 | T_B32 | T_ER_Y | T_ER_Z }, + { 0x6B, "vcvtps2iubs", T_MUST_EVEX | T_YMM | T_66 | T_MAP5 | T_EW0 | T_B32 | T_ER_Y | T_ER_Z }, + { 0x68, "vcvttps2ibs", T_MUST_EVEX | T_YMM | T_66 | T_MAP5 | T_EW0 | T_B32 | T_ER_Y | T_ER_Z }, + { 0x6A, "vcvttps2iubs", T_MUST_EVEX | T_YMM | T_66 | T_MAP5 | T_EW0 | T_B32 | T_ER_Y | T_ER_Z }, + // 13.10 + { 0x6C, "vcvttps2udqs", T_MUST_EVEX | T_YMM | T_MAP5 | T_EW0 | T_B32 | T_SAE_Y | T_SAE_Z }, }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { const Tbl *p = &tbl[i]; @@ -240,7 +264,6 @@ void putM_X() { 0x7F, "vmovdqu32", T_F3 | T_0F | T_MUST_EVEX | T_YMM | T_EW0 | T_ER_X | T_ER_Y | T_ER_Z | T_M_K }, { 0x7F, "vmovdqu64", T_F3 | T_0F | T_MUST_EVEX | T_YMM | T_EW1 | T_ER_X | T_ER_Y | T_ER_Z | T_M_K }, { 0x11, "vmovsh", T_F3 | T_MAP5 | T_MUST_EVEX | T_EW0 | T_N2 | T_M_K }, - { 0x7E, "vmovw", T_66 | T_MAP5 | T_MUST_EVEX | T_N2 }, }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { const Tbl *p = &tbl[i]; @@ -447,6 +470,13 @@ void putX_X_XM_IMM() { 0x1B, "vcvtne2ph2hf8s", T_MUST_EVEX | T_F2 | T_MAP5 | T_EW0 | T_YMM | T_B16 | T_N1, false }, { 0x52, "vdpphps", T_MUST_EVEX | T_0F38 | T_EW0 | T_YMM | T_B32, false }, + { 0x52, "vminmaxnepbf16", T_MUST_EVEX | T_F2 | T_0F3A | T_EW0 | T_YMM | T_B16, true }, + { 0x52, "vminmaxpd", T_MUST_EVEX | T_66 | T_0F3A | T_EW1 | T_YMM | T_B64 | T_SAE_Y | T_SAE_Z, true }, + { 0x52, "vminmaxph", T_MUST_EVEX | T_0F3A | T_EW0 | T_YMM | T_B16 | T_SAE_Y | T_SAE_Z, true }, + { 0x52, 
"vminmaxps", T_MUST_EVEX | T_66 | T_0F3A | T_EW0 | T_YMM | T_B32 | T_SAE_Y | T_SAE_Z, true }, + { 0x53, "vminmaxsd", T_MUST_EVEX | T_66 | T_0F3A | T_EW1 | T_SAE_X | T_N8, true }, + { 0x53, "vminmaxsh", T_MUST_EVEX | T_0F3A | T_EW0 | T_SAE_X | T_N2, true }, + { 0x53, "vminmaxss", T_MUST_EVEX | T_66 | T_0F3A | T_EW0 | T_SAE_X | T_N4, true }, }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { const Tbl *p = &tbl[i]; @@ -658,6 +688,22 @@ void putCvt() { 0x7B, "vcvtusi2sh", T_F3 | T_MAP5 | T_MUST_EVEX | T_ER_R | T_M_K, 6 }, { 0x72, "vcvtneps2bf16", T_MUST_EVEX | T_F3 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 2 }, + // 13.2 + { 0x6D, "vcvttpd2dqs", T_MUST_EVEX | T_YMM | T_MAP5 | T_EW1 | T_B64 | T_SAE_Y | T_SAE_Z, 2 }, + // 13.4 + { 0x6C, "vcvttpd2udqs", T_MUST_EVEX | T_YMM | T_MAP5 | T_EW1 | T_B64 | T_SAE_Y | T_SAE_Z, 2 }, + // 13.9 + { 0x6D, "vcvttps2qqs", T_MUST_EVEX | T_YMM | T_66 | T_MAP5 | T_EW0 | T_B32 | T_SAE_X | T_SAE_Y | T_N8 | T_N_VL, 1 }, + // 13.11 + { 0x6C, "vcvttps2uqqs", T_MUST_EVEX | T_YMM | T_66 | T_MAP5 | T_EW0 | T_B32 | T_SAE_X | T_SAE_Y | T_N8 | T_N_VL, 1 }, + // 13.12 + { 0x6D, "vcvttsd2sis", T_MUST_EVEX | T_F2 | T_MAP5 | T_EW0 | T_SAE_X | T_N8, 0 }, + // 13.13 + { 0x6C, "vcvttsd2usis", T_MUST_EVEX | T_F2 | T_MAP5 | T_EW0 | T_SAE_X | T_N8, 0 }, + // 13.14 + { 0x6D, "vcvttss2sis", T_MUST_EVEX | T_F3 | T_MAP5 | T_EW0 | T_SAE_X | T_N4, 0 }, + // 13.15 + { 0x6C, "vcvttss2usis", T_MUST_EVEX | T_F3 | T_MAP5 | T_EW0 | T_SAE_X | T_N4, 0 }, }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { const Tbl& p = tbl[i]; @@ -666,10 +712,10 @@ void putCvt() case 0: printf("void %s(const Reg32e& r, const Operand& op) { uint64_t type = (%s) | (r.isREG(64) ? 
T_EW1 : T_EW0); opVex(r, &xm0, op, type, 0x%02X); }\n", p.name, s.c_str(), p.code); break; - case 1: + case 1: // (x, x/m), (y, x/m256), (z, y/m) printf("void %s(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, %s, 0x%02X); }\n", p.name, s.c_str(), p.code); break; - case 2: + case 2: // (x, x/m), (x, y/m256), (y, z/m) printf("void %s(const Xmm& x, const Operand& op) { opCvt2(x, op, %s, 0x%02X); }\n", p.name, s.c_str(), p.code); break; case 3: @@ -1032,12 +1078,6 @@ void putFP16_2() printf("void vmovsh(const Xmm& x, const Address& addr) { opAVX_X_X_XM(x, xm0, addr, %s, 0x10); }\n", s.c_str()); printf("void vmovsh(const Xmm& x1, const Xmm& x2, const Xmm& x3) { opAVX_X_X_XM(x1, x2, x3, %s, 0x10); }\n", s.c_str()); } - { - uint64_t type = T_66 | T_MAP5 | T_MUST_EVEX | T_N2; - std::string s = type2String(type); - printf("void vmovw(const Xmm& x, const Operand& op) { if (!op.isREG(32|64) && !op.isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION) opAVX_X_X_XM(x, xm0, op, %s, 0x6E); }\n", s.c_str()); - printf("void vmovw(const Reg32e& r, const Xmm& x) { opAVX_X_X_XM(x, xm0, r, %s, 0x7E); }\n", s.c_str()); - } } void putFP16() diff --git a/gen/gen_code.cpp b/gen/gen_code.cpp index a22c12b2..c2db4ac8 100644 --- a/gen/gen_code.cpp +++ b/gen/gen_code.cpp @@ -1443,6 +1443,7 @@ void put() printf("void %s(const Xmm& x, const Operand& op) { opSSE_APX(x, op, T_0F38, 0x%02X, T_MUST_EVEX, 0x%02X); }\n", p->name, p->code, p->code2); } puts("void sha1rnds4(const Xmm& x, const Operand& op, uint8_t imm) { opSSE_APX(x, op, T_0F3A, 0xCC, T_MUST_EVEX, 0xD4, imm); }"); + puts("void sha1msg12(const Xmm& x, const Operand& op) { opROO(Reg(), op, x, T_MUST_EVEX, 0xD9); }"); } // (m, x), (m, y) { @@ -1733,9 +1734,6 @@ void put() } // mov { - printf("void vmovd(const Xmm& x, const Operand& op) { if (!op.isREG(32) && !op.isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION) opAVX_X_X_XM(x, xm0, op, T_0F | T_66 | T_W0 | T_EVEX | T_N4, 0x6E); }\n"); - printf("void vmovd(const Operand& op, const Xmm& x) 
{ if (!op.isREG(32) && !op.isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION) opAVX_X_X_XM(x, xm0, op, T_0F | T_66 | T_W0 | T_EVEX | T_N4, 0x7E); }\n"); - printf("void vmovq(const Xmm& x, const Address& addr) { uint64_t type; uint8_t code; if (x.getIdx() < 16) { type = T_0F | T_F3; code = 0x7E; } else { type = T_0F | T_66 | T_EVEX | T_EW1 | T_N8; code = 0x6E; } opAVX_X_X_XM(x, xm0, addr, type, code); }\n"); printf("void vmovq(const Address& addr, const Xmm& x) { opAVX_X_X_XM(x, xm0, addr, T_0F | T_66 | T_EVEX | T_EW1 | T_N8, x.getIdx() < 16 ? 0xD6 : 0x7E); }\n"); printf("void vmovq(const Xmm& x1, const Xmm& x2) { opAVX_X_X_XM(x1, xm0, x2, T_0F | T_F3 | T_EVEX | T_EW1 | T_N8, 0x7E); }\n"); @@ -1899,36 +1897,6 @@ void put() printf("void %s(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, %s, 0x%02X, encoding); }\n", p->name, s.c_str(), p->code); } } - // avx-vnni-int8 - // avx-vnni-int16 -#if 0 - { - const struct Tbl { - uint8_t code; - const char *name; - uint64_t type; - } tbl[] = { -// { 0x50, "vpdpbssd", T_F2 | T_0F38 | T_W0 | T_YMM }, -// { 0x51, "vpdpbssds", T_F2 | T_0F38 | T_W0 | T_YMM }, -// { 0x50, "vpdpbsud", T_F3 | T_0F38 | T_W0 | T_YMM }, -// { 0x51, "vpdpbsuds", T_F3 | T_0F38 | T_W0 | T_YMM }, -// { 0x50, "vpdpbuud", T_0F38 | T_W0 | T_YMM }, -// { 0x51, "vpdpbuuds", T_0F38 | T_W0 | T_YMM }, - -// { 0xD2, "vpdpwsud", T_F3 | T_0F38 | T_W0 | T_YMM }, -// { 0xD3, "vpdpwsuds", T_F3 | T_0F38 | T_W0 | T_YMM }, -// { 0xD2, "vpdpwusd", T_66 | T_0F38 | T_W0 | T_YMM }, -// { 0xD3, "vpdpwusds", T_66 | T_0F38 | T_W0 | T_YMM }, -// { 0xD2, "vpdpwuud", T_0F38 | T_W0 | T_YMM }, -// { 0xD3, "vpdpwuuds", T_0F38 | T_W0 | T_YMM }, - }; - for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { - const Tbl *p = &tbl[i]; - std::string s = type2String(p->type); - printf("void %s(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, %s, 0x%02X); }\n", p->name, s.c_str(), p->code); - } - } -#endif } void 
put32() diff --git a/meson.build b/meson.build index 3fb5e511..b69a379b 100644 --- a/meson.build +++ b/meson.build @@ -5,7 +5,7 @@ project( 'xbyak', 'cpp', - version: '7.10', + version: '7.20', license: 'BSD-3-Clause', default_options: 'b_ndebug=if-release' ) diff --git a/readme.md b/readme.md index 49f0a9d7..90d29345 100644 --- a/readme.md +++ b/readme.md @@ -1,5 +1,5 @@ -# Xbyak 7.10 [![Badge Build]][Build Status] +# Xbyak 7.20 [![Badge Build]][Build Status] *A C++ JIT assembler for x86 (IA32), x64 (AMD64, x86-64)* @@ -20,8 +20,7 @@ It is named from a Japanese word [開闢](https://translate.google.com/?hl=ja&sl - header file only - Intel/MASM like syntax -- fully support AVX-512 -- support APX/AVX10 +- Full support for AVX-512, APX, and AVX10.2 **Note**: Use `and_()`, `or_()`, ... instead of `and()`, `or()`. @@ -33,6 +32,7 @@ If you want to use them, then specify `-fno-operator-names` option to gcc/clang. ### News +- support AVX10.2 - support xresldtrk/xsusldtrk - support RAO-INT for APX - support AVX10 detection, AESKLE, WIDE_KL, KEYLOCKER, KEYLOCKER_WIDE diff --git a/readme.txt b/readme.txt index deabcd8b..65527f39 100644 --- a/readme.txt +++ b/readme.txt @@ -1,5 +1,5 @@ - C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 7.10 + C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 7.20 ----------------------------------------------------------------------------- ◎概要 @@ -14,7 +14,7 @@ xbyak.hをインクルードするだけですぐ利用することができます。 C++の枠組み内で閉じているため、外部アセンブラは不要です。 32bit/64bit両対応です。 - 対応ニーモニック:特権命令除くx86, MMX/MMX2/SSE/SSE2/SSE3/SSSE3/SSE4/FPU(一部)/AVX/AVX2/FMA/VEX-encoded GPR + 対応ニーモニック:特権命令除くx86, MMX/MMX2/SSE/SSE2/SSE3/SSSE3/SSE4/FPU(一部)/AVX/AVX2/FMA/AVX-512/APX/AVX10.2 ・Windows Xp(32bit, 64bit), Windows 7/Linux(32bit, 64bit)/Intel Mac対応 Windows Xp, Windows 7上ではVC2008, VC2010, VC2012 @@ -46,7 +46,7 @@ Linuxではmake installで/usr/local/include/xbyakにコピーされます。 ----------------------------------------------------------------------------- ◎新機能 -APX/AVX10対応 +APX/AVX10.2対応 例外なしモード追加 
XBYAK_NO_EXCEPTIONを定義してコンパイルするとgcc/clangで-fno-exceptionsオプションでコンパイルできます。 @@ -404,6 +404,9 @@ sample/{echo,hello}.bfは http://www.kmonos.net/alang/etc/brainfuck.php から ----------------------------------------------------------------------------- ◎履歴 +2024/10/15 ver 7.20 setDefaultEncoding/setDefaultEncodingAVX10の仕様確定 +2024/10/15 ver 7.11 AVX10.2完全サポート +2024/10/13 ver 7.10 AVX10 integer and fp16 vnni, mediaの新命令対応. setDefaultEncodingの拡張. 2024/10/10 ver 7.09.1 vpcompressbとvpcompresswの名前修正 2024/10/08 ver 7.09 AVX10.2のYMMレジスタの埋め込み丸め対応 2024/10/07 ver 7.08 rdfabaseなどサポート diff --git a/test/Makefile b/test/Makefile index 336dcaf8..cf5c7163 100644 --- a/test/Makefile +++ b/test/Makefile @@ -60,9 +60,12 @@ apx: apx.cpp $(XBYAK_INC) avx10_test: avx10_test.cpp $(XBYAK_INC) $(CXX) $(CFLAGS) avx10_test.cpp -o $@ -DXBYAK64 -TEST_FILES=old.txt new-ymm.txt bf16.txt comp.txt convert.txt +TEST_FILES=old.txt new-ymm.txt bf16.txt comp.txt misc.txt convert.txt minmax.txt saturation.txt xed_test: - @for target in $(addprefix avx10/, $(TEST_FILES)); do ./test_by_xed.sh $$target; done + @set -e; \ + for target in $(addprefix avx10/, $(TEST_FILES)); do \ + ./test_by_xed.sh $$target || exit 1; \ + done test_nm: normalize_prefix $(TARGET) $(MAKE) -C ../gen diff --git a/test/avx10/minmax.txt b/test/avx10/minmax.txt new file mode 100644 index 00000000..8b2c662d --- /dev/null +++ b/test/avx10/minmax.txt @@ -0,0 +1,66 @@ +vminmaxnepbf16(xm1|k3|T_z, xm2, xm3, 5); +vminmaxnepbf16(xm1|k3|T_z, xm2, ptr[rax+128], 5); +vminmaxnepbf16(xm1|k3|T_z, xm2, ptr_b[rax+128], 5); + +vminmaxnepbf16(ym1|k3|T_z, ym2, ym3, 5); +vminmaxnepbf16(ym1|k3|T_z, ym2, ptr[rax+128], 5); +vminmaxnepbf16(ym1|k3|T_z, ym2, ptr_b[rax+128], 5); + +vminmaxnepbf16(zm1|k3|T_z, zm2, zm3, 5); +vminmaxnepbf16(zm1|k3|T_z, zm2, ptr[rax+128], 5); +vminmaxnepbf16(zm1|k3|T_z, zm2, ptr_b[rax+128], 5); +// +vminmaxpd(xm1|k3|T_z, xm2, xm3, 5); +vminmaxpd(xm1|k3|T_z, xm2, ptr[rax+128], 5); +vminmaxpd(xm1|k3|T_z, xm2, ptr_b[rax+128], 5); + 
+vminmaxpd(ym1|k3|T_z, ym2, ym3, 5); +vminmaxpd(ym1|k3|T_z, ym2, ym3|T_sae, 5); +vminmaxpd(ym1|k3|T_z, ym2, ptr[rax+128], 5); +vminmaxpd(ym1|k3|T_z, ym2, ptr_b[rax+128], 5); + +vminmaxpd(zm1|k3|T_z, zm2, zm3, 5); +vminmaxpd(zm1|k3|T_z, zm2, zm3|T_sae, 5); +vminmaxpd(zm1|k3|T_z, zm2, ptr[rax+128], 5); +vminmaxpd(zm1|k3|T_z, zm2, ptr_b[rax+128], 5); +// +vminmaxph(xm1|k3|T_z, xm2, xm3, 5); +vminmaxph(xm1|k3|T_z, xm2, ptr[rax+128], 5); +vminmaxph(xm1|k3|T_z, xm2, ptr[rax+128], 5); +vminmaxph(xm1|k3|T_z, xm2, ptr_b[rax+128], 5); + +vminmaxph(ym1|k3|T_z, ym2, ym3, 5); +vminmaxph(ym1|k3|T_z, ym2, ym3|T_sae, 5); +vminmaxph(ym1|k3|T_z, ym2, ptr[rax+128], 5); +vminmaxph(ym1|k3|T_z, ym2, ptr_b[rax+128], 5); + +vminmaxph(zm1|k3|T_z, zm2, zm3, 5); +vminmaxph(zm1|k3|T_z, zm2, zm3|T_sae, 5); +vminmaxph(zm1|k3|T_z, zm2, ptr[rax+128], 5); +vminmaxph(zm1|k3|T_z, zm2, ptr_b[rax+128], 5); +// +vminmaxps(xm1|k3|T_z, xm2, xm3, 5); +vminmaxps(xm1|k3|T_z, xm2, ptr[rax+128], 5); +vminmaxps(xm1|k3|T_z, xm2, ptr_b[rax+128], 5); + +vminmaxps(ym1|k3|T_z, ym2, ym3, 5); +vminmaxps(ym1|k3|T_z, ym2, ym3|T_sae, 5); +vminmaxps(ym1|k3|T_z, ym2, ptr[rax+128], 5); +vminmaxps(ym1|k3|T_z, ym2, ptr_b[rax+128], 5); + +vminmaxps(zm1|k3|T_z, zm2, zm3, 5); +vminmaxps(zm1|k3|T_z, zm2, zm3|T_sae, 5); +vminmaxps(zm1|k3|T_z, zm2, ptr[rax+128], 5); +vminmaxps(zm1|k3|T_z, zm2, ptr_b[rax+128], 5); +// +vminmaxsd(xm1|k3|T_z, xm2, xm3, 5); +vminmaxsd(xm1|k3|T_z, xm2, xm3|T_sae, 5); +vminmaxsd(xm1|k3|T_z, xm2, ptr[rax+128], 5); +// +vminmaxsh(xm1|k3|T_z, xm2, xm3, 5); +vminmaxsh(xm1|k3|T_z, xm2, xm3|T_sae, 5); +vminmaxsh(xm1|k3|T_z, xm2, ptr[rax+128], 5); +// +vminmaxss(xm1|k3|T_z, xm2, xm3, 5); +vminmaxss(xm1|k3|T_z, xm2, xm3|T_sae, 5); +vminmaxss(xm1|k3|T_z, xm2, ptr[rax+128], 5); diff --git a/test/avx10/misc.txt b/test/avx10/misc.txt index 9464d034..6f5c1562 100644 --- a/test/avx10/misc.txt +++ b/test/avx10/misc.txt @@ -1,3 +1,4 @@ +// AVX10 integer and FP16 VNNI, media and zero-extending vdpphps(xm1, xm2, xm3); 
vdpphps(xm1, xm2, ptr[rax+128]); vdpphps(xm1, xm2, ptr_b[rax+128]); @@ -165,3 +166,14 @@ vpdpwuuds(ym1, ym2, ptr_b[rax+128]); vpdpwuuds(zm1, zm2, zm3); vpdpwuuds(zm1, zm2, ptr[rax+128]); vpdpwuuds(zm1, zm2, ptr_b[rax+128]); + +// +vmovd(xm10, xm20); +vmovd(xm1, xm2); +vmovd(xm10, ptr[rax+128]); +vmovd(ptr[rax+128], xm30); +// +vmovw(xm1, xm20); +vmovw(xm1, xm2); +vmovw(xm3, ptr [rax+0x40]); +vmovw(ptr [rax+0x40], xm7); diff --git a/test/avx10/old.txt b/test/avx10/old.txt index 9e4f097d..f5a143c9 100644 --- a/test/avx10/old.txt +++ b/test/avx10/old.txt @@ -355,10 +355,6 @@ vgetmantsh(xmm1|k1|T_z|T_sae, xmm3, xmm5, 0x6); vmovsh(xmm1|k1|T_z, ptr [rax+0x40]); vmovsh(ptr [rax+0x40]|k1, xmm1); vmovsh(xmm1|k2|T_z, xmm3, xmm5); -vmovw(xmm1, r13d); -vmovw(xmm3, ptr [rax+0x40]); -vmovw(r9d, xmm1); -vmovw(ptr [rax+0x40], xmm7); vcvtsd2sh(xmm1|k1|T_z|T_rd_sae, xmm2, xmm3); vcvtsd2sh(xmm1, xmm2, ptr [rax+0x40]); vcvtsh2sd(xmm1|k1|T_z|T_sae, xmm2, xmm3); diff --git a/test/avx10/saturation.txt b/test/avx10/saturation.txt new file mode 100644 index 00000000..f3ebf3dd --- /dev/null +++ b/test/avx10/saturation.txt @@ -0,0 +1,310 @@ +// +vcvtnebf162ibs(xm1, xm2); +vcvtnebf162ibs(xm1, ptr[rax+128]); +vcvtnebf162ibs(xm1, ptr_b[rax+128]); + +vcvtnebf162ibs(ym1, ym2); +vcvtnebf162ibs(ym1, ptr[rax+128]); +vcvtnebf162ibs(ym1, ptr_b[rax+128]); + +vcvtnebf162ibs(zm1, zm2); +vcvtnebf162ibs(zm1, ptr[rax+128]); +vcvtnebf162ibs(zm1, ptr_b[rax+128]); +// +vcvtnebf162iubs(xm1, xm2); +vcvtnebf162iubs(xm1, ptr[rax+128]); +vcvtnebf162iubs(xm1, ptr_b[rax+128]); + +vcvtnebf162iubs(ym1, ym2); +vcvtnebf162iubs(ym1, ptr[rax+128]); +vcvtnebf162iubs(ym1, ptr_b[rax+128]); + +vcvtnebf162iubs(zm1, zm2); +vcvtnebf162iubs(zm1, ptr[rax+128]); +vcvtnebf162iubs(zm1, ptr_b[rax+128]); +// +vcvttnebf162ibs(xm1, xm2); +vcvttnebf162ibs(xm1, ptr[rax+128]); +vcvttnebf162ibs(xm1, ptr_b[rax+128]); + +vcvttnebf162ibs(ym1, ym2); +vcvttnebf162ibs(ym1, ptr[rax+128]); +vcvttnebf162ibs(ym1, ptr_b[rax+128]); + 
+vcvttnebf162ibs(zm1, zm2); +vcvttnebf162ibs(zm1, ptr[rax+128]); +vcvttnebf162ibs(zm1, ptr_b[rax+128]); +// +vcvttnebf162iubs(xm1, xm2); +vcvttnebf162iubs(xm1, ptr[rax+128]); +vcvttnebf162iubs(xm1, ptr_b[rax+128]); + +vcvttnebf162iubs(ym1, ym2); +vcvttnebf162iubs(ym1, ptr[rax+128]); +vcvttnebf162iubs(ym1, ptr_b[rax+128]); + +vcvttnebf162iubs(zm1, zm2); +vcvttnebf162iubs(zm1, ptr[rax+128]); +vcvttnebf162iubs(zm1, ptr_b[rax+128]); +// +vcvttpd2qqs(xm1, xm2); +vcvttpd2qqs(xm1, ptr[rax+128]); +vcvttpd2qqs(xm1, ptr_b[rax+128]); + +vcvttpd2qqs(ym1, ym2); +vcvttpd2qqs(ym1, ym2|T_sae); +vcvttpd2qqs(ym1, ptr[rax+128]); +vcvttpd2qqs(ym1, ptr_b[rax+128]); + +vcvttpd2qqs(zm1, zm2); +vcvttpd2qqs(zm1, zm2|T_sae); +vcvttpd2qqs(zm1, ptr[rax+128]); +vcvttpd2qqs(zm1, ptr_b[rax+128]); +// +vcvttpd2uqqs(xm1, xm2); +vcvttpd2uqqs(xm1, ptr[rax+128]); +vcvttpd2uqqs(xm1, ptr_b[rax+128]); + +vcvttpd2uqqs(ym1, ym2); +vcvttpd2uqqs(ym1, ym2|T_sae); +vcvttpd2uqqs(ym1, ptr[rax+128]); +vcvttpd2uqqs(ym1, ptr_b[rax+128]); + +vcvttpd2uqqs(zm1, zm2); +vcvttpd2uqqs(zm1, zm2|T_sae); +vcvttpd2uqqs(zm1, ptr[rax+128]); +vcvttpd2uqqs(zm1, ptr_b[rax+128]); +// +vcvtph2ibs(xm1, xm2); +vcvtph2ibs(xm1, ptr[rax+128]); +vcvtph2ibs(xm1, ptr_b[rax+128]); + +vcvtph2ibs(ym1, ym2); +vcvtph2ibs(ym1, ym2|T_rd_sae); +vcvtph2ibs(ym1, ptr[rax+128]); +vcvtph2ibs(ym1, ptr_b[rax+128]); + +vcvtph2ibs(zm1, zm2); +vcvtph2ibs(zm1, zm2|T_ru_sae); +vcvtph2ibs(zm1, ptr[rax+128]); +vcvtph2ibs(zm1, ptr_b[rax+128]); +// +vcvtph2iubs(xm1, xm2); +vcvtph2iubs(xm1, ptr[rax+128]); +vcvtph2iubs(xm1, ptr_b[rax+128]); + +vcvtph2iubs(ym1, ym2); +vcvtph2iubs(ym1, ym2|T_rd_sae); +vcvtph2iubs(ym1, ptr[rax+128]); +vcvtph2iubs(ym1, ptr_b[rax+128]); + +vcvtph2iubs(zm1, zm2); +vcvtph2iubs(zm1, zm2|T_ru_sae); +vcvtph2iubs(zm1, ptr[rax+128]); +vcvtph2iubs(zm1, ptr_b[rax+128]); +// +vcvttph2ibs(xm1, xm2); +vcvttph2ibs(xm1, ptr[rax+128]); +vcvttph2ibs(xm1, ptr_b[rax+128]); + +vcvttph2ibs(ym1, ym2); +vcvttph2ibs(ym1, ym2|T_rd_sae); +vcvttph2ibs(ym1, 
ptr[rax+128]); +vcvttph2ibs(ym1, ptr_b[rax+128]); + +vcvttph2ibs(zm1, zm2); +vcvttph2ibs(zm1, zm2|T_ru_sae); +vcvttph2ibs(zm1, ptr[rax+128]); +vcvttph2ibs(zm1, ptr_b[rax+128]); +// +vcvttph2iubs(xm1, xm2); +vcvttph2iubs(xm1, ptr[rax+128]); +vcvttph2iubs(xm1, ptr_b[rax+128]); + +vcvttph2iubs(ym1, ym2); +vcvttph2iubs(ym1, ym2|T_rd_sae); +vcvttph2iubs(ym1, ptr[rax+128]); +vcvttph2iubs(ym1, ptr_b[rax+128]); + +vcvttph2iubs(zm1, zm2); +vcvttph2iubs(zm1, zm2|T_ru_sae); +vcvttph2iubs(zm1, ptr[rax+128]); +vcvttph2iubs(zm1, ptr_b[rax+128]); +// +vcvttps2dqs(xm1, xm2); +vcvttps2dqs(xm1, ptr[rax+128]); +vcvttps2dqs(xm1, ptr_b[rax+128]); + +vcvttps2dqs(ym1, ym2); +vcvttps2dqs(ym1, ym2|T_sae); +vcvttps2dqs(ym1, ptr[rax+128]); +vcvttps2dqs(ym1, ptr_b[rax+128]); + +vcvttps2dqs(zm1, zm2); +vcvttps2dqs(zm1, zm2|T_sae); +vcvttps2dqs(zm1, ptr[rax+128]); +vcvttps2dqs(zm1, ptr_b[rax+128]); +// +vcvtps2ibs(xm1, xm2); +vcvtps2ibs(xm1, ptr[rax+128]); +vcvtps2ibs(xm1, ptr_b[rax+128]); + +vcvtps2ibs(ym1, ym2); +vcvtps2ibs(ym1, ym2|T_rd_sae); +vcvtps2ibs(ym1, ptr[rax+128]); +vcvtps2ibs(ym1, ptr_b[rax+128]); + +vcvtps2ibs(zm1, zm2); +vcvtps2ibs(zm1, zm2|T_ru_sae); +vcvtps2ibs(zm1, ptr[rax+128]); +vcvtps2ibs(zm1, ptr_b[rax+128]); +// +vcvtps2iubs(xm1, xm2); +vcvtps2iubs(xm1, ptr[rax+128]); +vcvtps2iubs(xm1, ptr_b[rax+128]); + +vcvtps2iubs(ym1, ym2); +vcvtps2iubs(ym1, ym2|T_rd_sae); +vcvtps2iubs(ym1, ptr[rax+128]); +vcvtps2iubs(ym1, ptr_b[rax+128]); + +vcvtps2iubs(zm1, zm2); +vcvtps2iubs(zm1, zm2|T_ru_sae); +vcvtps2iubs(zm1, ptr[rax+128]); +vcvtps2iubs(zm1, ptr_b[rax+128]); +// +vcvttps2ibs(xm1, xm2); +vcvttps2ibs(xm1, ptr[rax+128]); +vcvttps2ibs(xm1, ptr_b[rax+128]); + +vcvttps2ibs(ym1, ym2); +vcvttps2ibs(ym1, ym2|T_rd_sae); +vcvttps2ibs(ym1, ptr[rax+128]); +vcvttps2ibs(ym1, ptr_b[rax+128]); + +vcvttps2ibs(zm1, zm2); +vcvttps2ibs(zm1, zm2|T_ru_sae); +vcvttps2ibs(zm1, ptr[rax+128]); +vcvttps2ibs(zm1, ptr_b[rax+128]); +// +vcvttps2iubs(xm1, xm2); +vcvttps2iubs(xm1, ptr[rax+128]); 
+vcvttps2iubs(xm1, ptr_b[rax+128]); + +vcvttps2iubs(ym1, ym2); +vcvttps2iubs(ym1, ym2|T_rd_sae); +vcvttps2iubs(ym1, ptr[rax+128]); +vcvttps2iubs(ym1, ptr_b[rax+128]); + +vcvttps2iubs(zm1, zm2); +vcvttps2iubs(zm1, zm2|T_ru_sae); +vcvttps2iubs(zm1, ptr[rax+128]); +vcvttps2iubs(zm1, ptr_b[rax+128]); +// +vcvttps2udqs(xm1, xm2); +vcvttps2udqs(xm1, ptr[rax+128]); +vcvttps2udqs(xm1, ptr_b[rax+128]); + +vcvttps2udqs(ym1, ym2); +vcvttps2udqs(ym1, ym2|T_sae); +vcvttps2udqs(ym1, ptr[rax+128]); +vcvttps2udqs(ym1, ptr_b[rax+128]); + +vcvttps2udqs(zm1, zm2); +vcvttps2udqs(zm1, zm2|T_sae); +vcvttps2udqs(zm1, ptr[rax+128]); +vcvttps2udqs(zm1, ptr_b[rax+128]); + +// +vcvttpd2dqs(xm1|k1|T_z, xm2); +vcvttpd2dqs(xm1|k1|T_z, xword [rax+128]); +vcvttpd2dqs(xm1|k1|T_z, xword_b[rax+128]); + +vcvttpd2dqs(xm1|k1|T_z, ym2); +vcvttpd2dqs(xm1|k1|T_z, ym2|T_sae); +vcvttpd2dqs(xm1|k1|T_z, yword [rax+128]); +vcvttpd2dqs(xm1|k1|T_z, yword_b[rax+128]); + +vcvttpd2dqs(ym1|k1|T_z, zm2); +vcvttpd2dqs(ym1|k1|T_z, zm2|T_sae); +vcvttpd2dqs(ym1|k1|T_z, zword [rax+128]); +vcvttpd2dqs(ym1|k1|T_z, zword_b[rax+128]); + +// +vcvttpd2udqs(xm1|k1|T_z, xm2); +vcvttpd2udqs(xm1|k1|T_z, xword [rax+128]); +vcvttpd2udqs(xm1|k1|T_z, xword_b[rax+128]); + +vcvttpd2udqs(xm1|k1|T_z, ym2); +vcvttpd2udqs(xm1|k1|T_z, ym2|T_sae); +vcvttpd2udqs(xm1|k1|T_z, yword [rax+128]); +vcvttpd2udqs(xm1|k1|T_z, yword_b[rax+128]); + +vcvttpd2udqs(ym1|k1|T_z, zm2); +vcvttpd2udqs(ym1|k1|T_z, zm2|T_sae); +vcvttpd2udqs(ym1|k1|T_z, zword [rax+128]); +vcvttpd2udqs(ym1|k1|T_z, zword_b[rax+128]); +// +vcvttps2qqs(xm1|k1|T_z, xm2); +vcvttps2qqs(xm1|k1|T_z, ptr [rax+128]); +vcvttps2qqs(xm1|k1|T_z, ptr_b[rax+128]); + +vcvttps2qqs(ym1|k1|T_z, xm2); +vcvttps2qqs(ym1|k1|T_z, xm2|T_sae); +vcvttps2qqs(ym1|k1|T_z, ptr [rax+128]); +vcvttps2qqs(ym1|k1|T_z, ptr_b[rax+128]); + +vcvttps2qqs(zm1, ym2); +vcvttps2qqs(zm1|k1|T_z, ym2); +vcvttps2qqs(zm1|k1|T_z|T_sae, ym2); +vcvttps2qqs(zm1|k1|T_z, ptr [rax+128]); +vcvttps2qqs(zm1|k1|T_z, ptr_b[rax+128]); + +// 
+vcvttps2uqqs(xm1|k1|T_z, xm2); +vcvttps2uqqs(xm1|k1|T_z, ptr [rax+128]); +vcvttps2uqqs(xm1|k1|T_z, ptr_b[rax+128]); + +vcvttps2uqqs(ym1|k1|T_z, xm2); +vcvttps2uqqs(ym1|k1|T_z, xm2|T_sae); +vcvttps2uqqs(ym1|k1|T_z, ptr [rax+128]); +vcvttps2uqqs(ym1|k1|T_z, ptr_b[rax+128]); + +vcvttps2uqqs(zm1, ym2); +vcvttps2uqqs(zm1|k1|T_z, ym2); +vcvttps2uqqs(zm1|k1|T_z|T_sae, ym2); +vcvttps2uqqs(zm1|k1|T_z, ptr [rax+128]); +vcvttps2uqqs(zm1|k1|T_z, ptr_b[rax+128]); + +// +vcvttsd2sis(eax, xm1); +vcvttsd2sis(eax, xm1|T_sae); +vcvttsd2sis(eax, ptr[rax+128]); + +vcvttsd2sis(r30, xm1); +vcvttsd2sis(r30, xm1|T_sae); +vcvttsd2sis(r30, ptr[rax+128]); +// +vcvttsd2usis(eax, xm1); +vcvttsd2usis(eax, xm1|T_sae); +vcvttsd2usis(eax, ptr[rax+128]); + +vcvttsd2usis(r30, xm1); +vcvttsd2usis(r30, xm1|T_sae); +vcvttsd2usis(r30, ptr[rax+128]); +// +vcvttss2sis(eax, xm1); +vcvttss2sis(eax, xm1|T_sae); +vcvttss2sis(eax, ptr[rax+128]); + +vcvttss2sis(r30, xm1); +vcvttss2sis(r30, xm1|T_sae); +vcvttss2sis(r30, ptr[rax+128]); +// +vcvttss2usis(eax, xm1); +vcvttss2usis(eax, xm1|T_sae); +vcvttss2usis(eax, ptr[rax+128]); + +vcvttss2usis(r30, xm1); +vcvttss2usis(r30, xm1|T_sae); +vcvttss2usis(r30, ptr[rax+128]); diff --git a/test/avx10_test.cpp b/test/avx10_test.cpp index 5f742fe7..1ceb52a0 100644 --- a/test/avx10_test.cpp +++ b/test/avx10_test.cpp @@ -234,10 +234,10 @@ CYBOZU_TEST_AUTO(vmpsadbw) struct Code : Xbyak::CodeGenerator { Code() { - setDefaultEncoding(); + setDefaultEncodingAVX10(); vmpsadbw(xm1, xm3, xm15, 3); // vex(avx) vmpsadbw(ym1, ym3, ptr[rax+128], 3); // vex(avx2) - setDefaultEncoding(VexEncoding, EvexEncoding); + setDefaultEncodingAVX10(AVX10v2Encoding); vmpsadbw(ym1, ym3, ym15, 3); // evex(avx10.2) vmpsadbw(ym1, ym3, ptr[rax+128], 3); // evex(avx10.2) } diff --git a/test/test_by_xed.cpp b/test/test_by_xed.cpp index ddac779a..9be9199c 100644 --- a/test/test_by_xed.cpp +++ b/test/test_by_xed.cpp @@ -7,7 +7,7 @@ struct Code : Xbyak::CodeGenerator { Code() : Xbyak::CodeGenerator(4096*8) { 
- setDefaultEncoding(VexEncoding, EvexEncoding); + setDefaultEncodingAVX10(AVX10v2Encoding); #include "tmp.cpp" } }; diff --git a/test/test_by_xed.py b/test/test_by_xed.py index afd77d8a..1e84c6ae 100644 --- a/test/test_by_xed.py +++ b/test/test_by_xed.py @@ -366,7 +366,7 @@ def parseNmemonicTest(): ('vpshldw(xmm9|k3|T_z, xmm2, ptr [rax + 0x40], 5);', Nmemonic('vpshldw', [xmm9, xmm2, Memory(0, rax, None, 0, 0x40), 5], [k3, T_z])), ('vpshrdd(xmm5|k3|T_z, xmm2, ptr_b [rax + 0x40], 5);', Nmemonic('vpshrdd', [xmm5, xmm2, Memory(0, rax, None, 0, 0x40, True), 5], [k3, T_z])), ('vpshrdd xmm5{k3}{z}, xmm2, dword ptr [rax+0x40]{1to4}, 0x5', Nmemonic('vpshrdd', [xmm5, xmm2, Memory(0, rax, None, 0, 0x40, True), 5], [k3, T_z])), - ('vcmpph(k1, xm15, ptr[rax+64], 1);', Nmemonic('vcmpph', [k1, xm15, Memory(0, rax, None, 0, 64), 1])), + ('vcmpph(k1, xmm15, ptr[rax+64], 1);', Nmemonic('vcmpph', [k1, xmm15, Memory(0, rax, None, 0, 64), 1])), ] for (s, expected) in tbl: e = parseNmemonic(s) diff --git a/xbyak/xbyak.h b/xbyak/xbyak.h index 552e451e..c0bd83ee 100644 --- a/xbyak/xbyak.h +++ b/xbyak/xbyak.h @@ -155,7 +155,7 @@ namespace Xbyak { enum { DEFAULT_MAX_CODE_SIZE = 4096, - VERSION = 0x7100 /* 0xABCD = A.BC(.D) */ + VERSION = 0x7200 /* 0xABCD = A.BC(.D) */ }; #ifndef MIE_INTEGER_TYPE_DEFINED @@ -232,6 +232,7 @@ enum { ERR_CANT_USE_REX2, ERR_INVALID_DFV, ERR_INVALID_REG_IDX, + ERR_BAD_ENCODING_MODE, ERR_INTERNAL // Put it at last. 
}; @@ -290,6 +291,7 @@ inline const char *ConvertErrorToString(int err) "can't use rex2", "invalid dfv", "invalid reg index", + "bad encoding mode", "internal error" }; assert(ERR_INTERNAL + 1 == sizeof(errTbl) / sizeof(*errTbl)); @@ -1673,7 +1675,9 @@ inline const uint8_t* Label::getAddress() const typedef enum { DefaultEncoding, VexEncoding, - EvexEncoding + EvexEncoding, + PreAVX10v2Encoding, + AVX10v2Encoding } PreferredEncoding; class CodeGenerator : public CodeArray { @@ -2661,21 +2665,24 @@ class CodeGenerator : public CodeArray { if (addr.getRegExp().getIndex().getKind() != kind) XBYAK_THROW(ERR_BAD_VSIB_ADDRESSING) opVex(x, 0, addr, type, code); } - void opEncoding(const Xmm& x1, const Xmm& x2, const Operand& op, uint64_t type, int code, PreferredEncoding encoding, int imm = NONE, uint64_t typeVex = 0, uint64_t typeEvex = 0, int sel = 0) + void opEncoding(const Xmm& x1, const Xmm& x2, const Operand& op, uint64_t type, int code, PreferredEncoding enc, int imm = NONE, uint64_t typeVex = 0, uint64_t typeEvex = 0, int sel = 0) { - opAVX_X_X_XM(x1, x2, op, type | orEvexIf(encoding, typeVex, typeEvex, sel), code, imm); + opAVX_X_X_XM(x1, x2, op, type | orEvexIf(enc, typeVex, typeEvex, sel), code, imm); } - int orEvexIf(PreferredEncoding encoding, uint64_t typeVex, uint64_t typeEvex, int sel) { - if (encoding == DefaultEncoding) { - encoding = defaultEncoding_[sel]; + PreferredEncoding getEncoding(PreferredEncoding enc, int sel) const + { + if (enc == DefaultEncoding) { + enc = defaultEncoding_[sel]; } - if (encoding == EvexEncoding) { + if ((sel == 0 && enc != VexEncoding && enc != EvexEncoding) || (sel == 1 && enc != PreAVX10v2Encoding && enc != AVX10v2Encoding)) XBYAK_THROW_RET(ERR_BAD_ENCODING_MODE, VexEncoding) #ifdef XBYAK_DISABLE_AVX512 - XBYAK_THROW(ERR_EVEX_IS_INVALID) + if (enc == EvexEncoding || enc == AVX10v2Encoding) XBYAK_THROW(ERR_EVEX_IS_INVALID) #endif - return T_MUST_EVEX | typeEvex; - } - return typeVex; + return enc; + } + uint64_t 
orEvexIf(PreferredEncoding enc, uint64_t typeVex, uint64_t typeEvex, int sel) { + enc = getEncoding(enc, sel); + return ((sel == 0 && enc == VexEncoding) || (sel == 1 && enc != AVX10v2Encoding)) ? typeVex : (T_MUST_EVEX | typeEvex); } void opInOut(const Reg& a, const Reg& d, uint8_t code) { @@ -3132,8 +3139,8 @@ class CodeGenerator : public CodeArray { #endif , isDefaultJmpNEAR_(false) { - // select avx512-vnni, vmpsadbw(avx) setDefaultEncoding(); + setDefaultEncodingAVX10(); labelMgr_.set(this); } void reset() @@ -3170,16 +3177,20 @@ class CodeGenerator : public CodeArray { #undef jnl #endif - // set default encoding - // vnniEnc : control AVX512_VNNI (evex:default) or AVX-VNNI (vex) - // avx10Enc : control mpsadbw, AVX-VNNI-INT8 (vex:default) or AVX10.2 (evex) - void setDefaultEncoding(PreferredEncoding vnniEnc = EvexEncoding, PreferredEncoding avx10Enc = VexEncoding) - { defaultEncoding_[0] = vnniEnc; defaultEncoding_[1] = avx10Enc; } - - void sha1msg12(const Xmm& x, const Operand& op) + // set default encoding of VNNI + // EvexEncoding : AVX512_VNNI, VexEncoding : AVX-VNNI + void setDefaultEncoding(PreferredEncoding enc = EvexEncoding) + { + if (enc != VexEncoding && enc != EvexEncoding) XBYAK_THROW(ERR_BAD_ENCODING_MODE) + defaultEncoding_[0] = enc; + } + // PreAVX10v2Encoding (default) : AVX-VNNI-INT8/AVX512-FP16, AVX10v2Encoding : AVX10.2 + void setDefaultEncodingAVX10(PreferredEncoding enc = PreAVX10v2Encoding) { - opROO(Reg(), op, x, T_MUST_EVEX, 0xD9); + if (enc != PreAVX10v2Encoding && enc != AVX10v2Encoding) XBYAK_THROW(ERR_BAD_ENCODING_MODE) + defaultEncoding_[1] = enc; } + void bswap(const Reg32e& r) { int idx = r.getIdx(); @@ -3192,6 +3203,48 @@ class CodeGenerator : public CodeArray { } db(0xC8 + (idx & 7)); } + // AVX10 zero-extending for vmovd, vmovw + void opAVX10ZeroExt(const Operand& op1, const Operand& op2, const uint64_t typeTbl[4], const int codeTbl[4], PreferredEncoding enc, int bit) + { + const Operand *p1 = &op1; + const Operand *p2 = &op2; + bool rev = false; + if
(p1->isMEM()) { + std::swap(p1, p2); + rev = true; + } + if (p1->isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION) + if (p1->isXMM()) { + std::swap(p1, p2); + rev = !rev; + } + int sel = -1; + if (getEncoding(enc, 1) == AVX10v2Encoding) { + if ((p1->isXMM() || p1->isMEM()) && p2->isXMM()) sel = 2 + int(rev); + } else { + if ((p1->isREG(bit) || p1->isMEM()) && p2->isXMM()) sel = int(rev); + } + if (sel == -1) XBYAK_THROW(ERR_BAD_COMBINATION) + opAVX_X_X_XM(*static_cast<const Xmm*>(p2), xm0, *p1, typeTbl[sel], codeTbl[sel]); + } + void vmovd(const Operand& op1, const Operand& op2, PreferredEncoding enc = DefaultEncoding) + { + const uint64_t typeTbl[] = { + T_EVEX|T_66|T_0F|T_W0|T_N4, T_EVEX|T_66|T_0F|T_W0|T_N4, // legacy, avx, avx512 + T_MUST_EVEX|T_66|T_0F|T_EW0|T_N4, T_MUST_EVEX|T_F3|T_0F|T_EW0|T_N4, // avx10.2 + }; + const int codeTbl[] = { 0x7E, 0x6E, 0xD6, 0x7E }; + opAVX10ZeroExt(op1, op2, typeTbl, codeTbl, enc, 32); + } + void vmovw(const Operand& op1, const Operand& op2, PreferredEncoding enc = DefaultEncoding) + { + const uint64_t typeTbl[] = { + T_MUST_EVEX|T_66|T_MAP5|T_N2, T_MUST_EVEX|T_66|T_MAP5|T_N2, // avx512-fp16 + T_MUST_EVEX|T_F3|T_MAP5|T_EW0|T_N2, T_MUST_EVEX|T_F3|T_MAP5|T_EW0|T_N2, // avx10.2 + }; + const int codeTbl[] = { 0x7E, 0x6E, 0x7E, 0x6E }; + opAVX10ZeroExt(op1, op2, typeTbl, codeTbl, enc, 16|32|64); + } /* use single byte nop if useMultiByteNop = false */ diff --git a/xbyak/xbyak_mnemonic.h b/xbyak/xbyak_mnemonic.h index 0397ffdc..087db031 100644 --- a/xbyak/xbyak_mnemonic.h +++ b/xbyak/xbyak_mnemonic.h @@ -1,4 +1,4 @@ -const char *getVersionString() const { return "7.10"; } +const char *getVersionString() const { return "7.20"; } void aadd(const Address& addr, const Reg32e &reg) { opMR(addr, reg, T_0F38, 0x0FC, T_APX); } void aand(const Address& addr, const Reg32e &reg) { opMR(addr, reg, T_0F38|T_66, 0x0FC, T_APX|T_66); } void adc(const Operand& op, uint32_t imm) { opOI(op, imm, 0x10, 2); } @@ -988,6 +988,7 @@ void sets(const Operand& op) { opSetCC(op, 8);
}//-V524 void setz(const Operand& op) { opSetCC(op, 4); }//-V524 void sfence() { db(0x0F); db(0xAE); db(0xF8); } void sha1msg1(const Xmm& x, const Operand& op) { opSSE_APX(x, op, T_0F38, 0xC9, T_MUST_EVEX, 0xD9); } +void sha1msg12(const Xmm& x, const Operand& op) { opROO(Reg(), op, x, T_MUST_EVEX, 0xD9); } void sha1msg2(const Xmm& x, const Operand& op) { opSSE_APX(x, op, T_0F38, 0xCA, T_MUST_EVEX, 0xDA); } void sha1nexte(const Xmm& x, const Operand& op) { opSSE_APX(x, op, T_0F38, 0xC8, T_MUST_EVEX, 0xD8); } void sha1rnds4(const Xmm& x, const Operand& op, uint8_t imm) { opSSE_APX(x, op, T_0F3A, 0xCC, T_MUST_EVEX, 0xD4, imm); } @@ -1331,8 +1332,6 @@ void vmovapd(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, T_ void vmovapd(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_66|T_0F|T_EW1|T_YMM|T_EVEX, 0x28); } void vmovaps(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, T_0F|T_EW0|T_YMM|T_EVEX|T_M_K, 0x29); } void vmovaps(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_0F|T_EW0|T_YMM|T_EVEX, 0x28); } -void vmovd(const Operand& op, const Xmm& x) { if (!op.isREG(32) && !op.isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION) opAVX_X_X_XM(x, xm0, op, T_0F | T_66 | T_W0 | T_EVEX | T_N4, 0x7E); } -void vmovd(const Xmm& x, const Operand& op) { if (!op.isREG(32) && !op.isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION) opAVX_X_X_XM(x, xm0, op, T_0F | T_66 | T_W0 | T_EVEX | T_N4, 0x6E); } void vmovddup(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_DUP|T_F2|T_0F|T_EW1|T_YMM|T_EVEX|T_ER_X|T_ER_Y|T_ER_Z, 0x12); } void vmovdqa(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, T_66|T_0F|T_YMM, 0x7F); } void vmovdqa(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_66|T_0F|T_YMM, 0x6F); } @@ -2202,6 +2201,8 @@ void vcvtne2ph2bf8s(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X void vcvtne2ph2hf8(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, 
T_N1|T_F2|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x18); } void vcvtne2ph2hf8s(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N1|T_F2|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x1B); } void vcvtne2ps2bf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F2|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B32, 0x72); } +void vcvtnebf162ibs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_F2|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x69); } +void vcvtnebf162iubs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_F2|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x6B); } void vcvtneph2bf8(const Xmm& x, const Operand& op) { opCvt2(x, op, T_F3|T_0F38|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x74); } void vcvtneph2bf8s(const Xmm& x, const Operand& op) { opCvt2(x, op, T_F3|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x74); } void vcvtneph2hf8(const Xmm& x, const Operand& op) { opCvt2(x, op, T_F3|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x18); } @@ -2212,6 +2213,8 @@ void vcvtpd2qq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0 void vcvtpd2udq(const Xmm& x, const Operand& op) { opCvt2(x, op, T_0F|T_EW1|T_YMM|T_ER_Z|T_MUST_EVEX|T_B64, 0x79); } void vcvtpd2uqq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F|T_EW1|T_YMM|T_ER_Z|T_MUST_EVEX|T_B64, 0x79); } void vcvtph2dq(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_N8|T_N_VL|T_66|T_MAP5|T_EW0|T_YMM|T_ER_Y|T_MUST_EVEX|T_B16, 0x5B); } +void vcvtph2ibs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5|T_EW0|T_YMM|T_ER_Y|T_ER_Z|T_MUST_EVEX|T_B16, 0x69); } +void vcvtph2iubs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5|T_EW0|T_YMM|T_ER_Y|T_ER_Z|T_MUST_EVEX|T_B16, 0x6B); } void vcvtph2pd(const Xmm& x, const Operand& op) { if (!op.isXMM() && !op.isMEM()) XBYAK_THROW(ERR_BAD_MEM_SIZE) opVex(x, 0, op, T_N4|T_N_VL|T_MAP5|T_EW0|T_YMM|T_SAE_X|T_MUST_EVEX|T_B16, 0x5A); } void vcvtph2psx(const Xmm& x, const 
Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_N8|T_N_VL|T_66|T_MAP6|T_EW0|T_YMM|T_SAE_Y|T_MUST_EVEX|T_B16, 0x13); } void vcvtph2qq(const Xmm& x, const Operand& op) { if (!op.isXMM() && !op.isMEM()) XBYAK_THROW(ERR_BAD_MEM_SIZE) opVex(x, 0, op, T_N4|T_N_VL|T_66|T_MAP5|T_EW0|T_YMM|T_ER_X|T_MUST_EVEX|T_B16, 0x7B); } @@ -2219,6 +2222,8 @@ void vcvtph2udq(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, void vcvtph2uqq(const Xmm& x, const Operand& op) { if (!op.isXMM() && !op.isMEM()) XBYAK_THROW(ERR_BAD_MEM_SIZE) opVex(x, 0, op, T_N4|T_N_VL|T_66|T_MAP5|T_EW0|T_YMM|T_ER_X|T_MUST_EVEX|T_B16, 0x79); } void vcvtph2uw(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x7D); } void vcvtph2w(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP5|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x7D); } +void vcvtps2ibs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP5|T_EW0|T_YMM|T_ER_Y|T_ER_Z|T_MUST_EVEX|T_B32, 0x69); } +void vcvtps2iubs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP5|T_EW0|T_YMM|T_ER_Y|T_ER_Z|T_MUST_EVEX|T_B32, 0x6B); } void vcvtps2phx(const Xmm& x, const Operand& op) { checkCvt4(x, op); opCvt(x, op, T_N16|T_N_VL|T_66|T_MAP5|T_EW0|T_ER_Z|T_MUST_EVEX|T_B32, 0x1D); } void vcvtps2qq(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_N8|T_N_VL|T_66|T_0F|T_EW0|T_YMM|T_ER_Y|T_MUST_EVEX|T_B32, 0x7B); } void vcvtps2udq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_0F|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B32, 0x79); } @@ -2235,22 +2240,40 @@ void vcvtsh2usi(const Reg32e& r, const Operand& op) { uint64_t type = (T_N2|T_F3 void vcvtsi2sh(const Xmm& x1, const Xmm& x2, const Operand& op) { if (!(x1.isXMM() && x2.isXMM() && op.isBit(32|64))) XBYAK_THROW(ERR_BAD_COMBINATION) uint64_t type = (T_F3|T_MAP5|T_ER_R|T_MUST_EVEX|T_M_K) | (op.isBit(32) ? 
(T_EW0 | T_N4) : (T_EW1 | T_N8)); opVex(x1, &x2, op, type, 0x2A); } void vcvtss2sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N4|T_MAP5|T_EW0|T_ER_X|T_MUST_EVEX, 0x1D); } void vcvtss2usi(const Reg32e& r, const Operand& op) { uint64_t type = (T_N4|T_F3|T_0F|T_ER_X|T_MUST_EVEX) | (r.isREG(64) ? T_EW1 : T_EW0); opVex(r, &xm0, op, type, 0x79); } +void vcvttnebf162ibs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_F2|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x68); } +void vcvttnebf162iubs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_F2|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x6A); } +void vcvttpd2dqs(const Xmm& x, const Operand& op) { opCvt2(x, op, T_MAP5|T_EW1|T_YMM|T_SAE_Y|T_SAE_Z|T_MUST_EVEX|T_B64, 0x6D); } void vcvttpd2qq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F|T_EW1|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B64, 0x7A); } +void vcvttpd2qqs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP5|T_EW1|T_YMM|T_SAE_Y|T_SAE_Z|T_MUST_EVEX|T_B64, 0x6D); } void vcvttpd2udq(const Xmm& x, const Operand& op) { opCvt2(x, op, T_0F|T_EW1|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B64, 0x78); } +void vcvttpd2udqs(const Xmm& x, const Operand& op) { opCvt2(x, op, T_MAP5|T_EW1|T_YMM|T_SAE_Y|T_SAE_Z|T_MUST_EVEX|T_B64, 0x6C); } void vcvttpd2uqq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F|T_EW1|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B64, 0x78); } +void vcvttpd2uqqs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP5|T_EW1|T_YMM|T_SAE_Y|T_SAE_Z|T_MUST_EVEX|T_B64, 0x6C); } void vcvttph2dq(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_N8|T_N_VL|T_F3|T_MAP5|T_EW0|T_YMM|T_SAE_Y|T_MUST_EVEX|T_B16, 0x5B); } +void vcvttph2ibs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5|T_EW0|T_YMM|T_ER_Y|T_ER_Z|T_MUST_EVEX|T_B16, 0x68); } +void vcvttph2iubs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5|T_EW0|T_YMM|T_ER_Y|T_ER_Z|T_MUST_EVEX|T_B16, 0x6A); 
} void vcvttph2qq(const Xmm& x, const Operand& op) { if (!op.isXMM() && !op.isMEM()) XBYAK_THROW(ERR_BAD_MEM_SIZE) opVex(x, 0, op, T_N4|T_N_VL|T_66|T_MAP5|T_EW0|T_YMM|T_SAE_X|T_MUST_EVEX|T_B16, 0x7A); } void vcvttph2udq(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_N8|T_N_VL|T_MAP5|T_EW0|T_YMM|T_SAE_Y|T_MUST_EVEX|T_B16, 0x78); } void vcvttph2uqq(const Xmm& x, const Operand& op) { if (!op.isXMM() && !op.isMEM()) XBYAK_THROW(ERR_BAD_MEM_SIZE) opVex(x, 0, op, T_N4|T_N_VL|T_66|T_MAP5|T_EW0|T_YMM|T_SAE_X|T_MUST_EVEX|T_B16, 0x78); } void vcvttph2uw(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B16, 0x7C); } void vcvttph2w(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP5|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B16, 0x7C); } +void vcvttps2dqs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5|T_EW0|T_YMM|T_SAE_Y|T_SAE_Z|T_MUST_EVEX|T_B32, 0x6D); } +void vcvttps2ibs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP5|T_EW0|T_YMM|T_ER_Y|T_ER_Z|T_MUST_EVEX|T_B32, 0x68); } +void vcvttps2iubs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP5|T_EW0|T_YMM|T_ER_Y|T_ER_Z|T_MUST_EVEX|T_B32, 0x6A); } void vcvttps2qq(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_N8|T_N_VL|T_66|T_0F|T_EW0|T_YMM|T_SAE_Y|T_MUST_EVEX|T_B32, 0x7A); } +void vcvttps2qqs(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_N8|T_N_VL|T_66|T_MAP5|T_EW0|T_YMM|T_SAE_X|T_SAE_Y|T_MUST_EVEX|T_B32, 0x6D); } void vcvttps2udq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_0F|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B32, 0x78); } +void vcvttps2udqs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5|T_EW0|T_YMM|T_SAE_Y|T_SAE_Z|T_MUST_EVEX|T_B32, 0x6C); } void vcvttps2uqq(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_N8|T_N_VL|T_66|T_0F|T_EW0|T_YMM|T_SAE_Y|T_MUST_EVEX|T_B32, 0x78); } +void 
vcvttps2uqqs(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_N8|T_N_VL|T_66|T_MAP5|T_EW0|T_YMM|T_SAE_X|T_SAE_Y|T_MUST_EVEX|T_B32, 0x6C); } +void vcvttsd2sis(const Reg32e& r, const Operand& op) { uint64_t type = (T_N8|T_F2|T_MAP5|T_EW0|T_SAE_X|T_MUST_EVEX) | (r.isREG(64) ? T_EW1 : T_EW0); opVex(r, &xm0, op, type, 0x6D); } void vcvttsd2usi(const Reg32e& r, const Operand& op) { uint64_t type = (T_N8|T_F2|T_0F|T_SAE_X|T_MUST_EVEX) | (r.isREG(64) ? T_EW1 : T_EW0); opVex(r, &xm0, op, type, 0x78); } +void vcvttsd2usis(const Reg32e& r, const Operand& op) { uint64_t type = (T_N8|T_F2|T_MAP5|T_EW0|T_SAE_X|T_MUST_EVEX) | (r.isREG(64) ? T_EW1 : T_EW0); opVex(r, &xm0, op, type, 0x6C); } void vcvttsh2si(const Reg32e& r, const Operand& op) { uint64_t type = (T_N2|T_F3|T_MAP5|T_EW0|T_SAE_X|T_MUST_EVEX) | (r.isREG(64) ? T_EW1 : T_EW0); opVex(r, &xm0, op, type, 0x2C); } void vcvttsh2usi(const Reg32e& r, const Operand& op) { uint64_t type = (T_N2|T_F3|T_MAP5|T_EW0|T_SAE_X|T_MUST_EVEX) | (r.isREG(64) ? T_EW1 : T_EW0); opVex(r, &xm0, op, type, 0x78); } +void vcvttss2sis(const Reg32e& r, const Operand& op) { uint64_t type = (T_N4|T_F3|T_MAP5|T_EW0|T_SAE_X|T_MUST_EVEX) | (r.isREG(64) ? T_EW1 : T_EW0); opVex(r, &xm0, op, type, 0x6D); } void vcvttss2usi(const Reg32e& r, const Operand& op) { uint64_t type = (T_N4|T_F3|T_0F|T_SAE_X|T_MUST_EVEX) | (r.isREG(64) ? T_EW1 : T_EW0); opVex(r, &xm0, op, type, 0x78); } +void vcvttss2usis(const Reg32e& r, const Operand& op) { uint64_t type = (T_N4|T_F3|T_MAP5|T_EW0|T_SAE_X|T_MUST_EVEX) | (r.isREG(64) ? 
T_EW1 : T_EW0); opVex(r, &xm0, op, type, 0x6C); } void vcvtudq2pd(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_N8|T_N_VL|T_F3|T_0F|T_EW0|T_YMM|T_MUST_EVEX|T_B32, 0x7A); } void vcvtudq2ph(const Xmm& x, const Operand& op) { checkCvt4(x, op); opCvt(x, op, T_N16|T_N_VL|T_F2|T_MAP5|T_EW0|T_ER_Z|T_MUST_EVEX|T_B32, 0x7A); } void vcvtudq2ps(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_F2|T_0F|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B32, 0x7A); } @@ -2374,6 +2397,13 @@ void vinserti64x4(const Zmm& r1, const Zmm& r2, const Operand& op, uint8_t imm) void vmaxpbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x5F); } void vmaxph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_SAE_Z | T_B16, 0x5F); } void vmaxsh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_EW0 | T_MUST_EVEX | T_SAE_X | T_N2, 0x5F); } +void vminmaxnepbf16(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_F2|T_0F3A|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x52, imm); } +void vminmaxpd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F3A|T_EW1|T_YMM|T_SAE_Y|T_SAE_Z|T_MUST_EVEX|T_B64, 0x52, imm); } +void vminmaxph(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_0F3A|T_EW0|T_YMM|T_SAE_Y|T_SAE_Z|T_MUST_EVEX|T_B16, 0x52, imm); } +void vminmaxps(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F3A|T_EW0|T_YMM|T_SAE_Y|T_SAE_Z|T_MUST_EVEX|T_B32, 0x52, imm); } +void vminmaxsd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_N8|T_66|T_0F3A|T_EW1|T_SAE_X|T_MUST_EVEX, 0x53, imm); } +void vminmaxsh(const Xmm& x1, const Xmm& x2, const Operand& op, 
uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_N2|T_0F3A|T_EW0|T_SAE_X|T_MUST_EVEX, 0x53, imm); } +void vminmaxss(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_N4|T_66|T_0F3A|T_EW0|T_SAE_X|T_MUST_EVEX, 0x53, imm); } void vminpbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x5D); } void vminph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_SAE_Z | T_B16, 0x5D); } void vminsh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_EW0 | T_MUST_EVEX | T_SAE_X | T_N2, 0x5D); } @@ -2392,9 +2422,6 @@ void vmovdqu8(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_F2|T_0F void vmovsh(const Address& addr, const Xmm& x) { opAVX_X_XM_IMM(x, addr, T_N2|T_F3|T_MAP5|T_EW0|T_MUST_EVEX|T_M_K, 0x11); } void vmovsh(const Xmm& x, const Address& addr) { opAVX_X_X_XM(x, xm0, addr, T_N2|T_F3|T_MAP5|T_EW0|T_MUST_EVEX, 0x10); } void vmovsh(const Xmm& x1, const Xmm& x2, const Xmm& x3) { opAVX_X_X_XM(x1, x2, x3, T_N2|T_F3|T_MAP5|T_EW0|T_MUST_EVEX, 0x10); } -void vmovw(const Address& addr, const Xmm& x) { opAVX_X_XM_IMM(x, addr, T_N2|T_66|T_MAP5|T_MUST_EVEX, 0x7E); } -void vmovw(const Reg32e& r, const Xmm& x) { opAVX_X_X_XM(x, xm0, r, T_N2|T_66|T_MAP5|T_MUST_EVEX, 0x7E); } -void vmovw(const Xmm& x, const Operand& op) { if (!op.isREG(32|64) && !op.isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION) opAVX_X_X_XM(x, xm0, op, T_N2|T_66|T_MAP5|T_MUST_EVEX, 0x6E); } void vmpsadbw(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_0F3A|T_YMM, 0x42, encoding, imm, T_66|T_W0|T_YMM, T_F3|T_0F3A|T_EW0|T_B32, 1); } void vmulnepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x59); } 
void vmulph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_ER_Z | T_B16, 0x59); }