[llvm] [X86] Improve opmask handling for AVX10.1-256 (PR #73074)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Nov 21 19:18:53 PST 2023
llvmbot wrote:
@llvm/pr-subscribers-backend-x86
Author: Evgenii Kudriashov (e-kud)
Quadword opmask instructions are only supported on processors that support 512-bit vector lengths.
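For context, AVX10.1-256 exposes the AVX512BW feature set at 128/256-bit vector widths only, so the quadword (64-bit) opmask forms such as KMOVQ and KXORQ are unavailable and every use must additionally be gated on EVEX512. A minimal standalone sketch of the predicate pattern the patch applies throughout (a plain struct and strings stand in for llvm::X86Subtarget and the opcode enums; this is illustrative, not LLVM API):

```cpp
// Self-contained sketch of the gating this patch applies: the quadword
// kmask opcode is only legal when both BWI and EVEX512 are present.
// Subtarget is an illustrative stand-in for llvm::X86Subtarget.
#include <cstdio>

struct Subtarget {
  bool BWI;     // AVX512BW: 32/64-bit opmask instructions exist
  bool EVEX512; // 512-bit vectors available (false on AVX10.1-256)
  bool hasBWI() const { return BWI; }
  bool hasEVEX512() const { return EVEX512; }
};

// Before the patch this returned the Q form on hasBWI() alone, which would
// pick an unencodable instruction on AVX10.1-256 (BWI set, EVEX512 clear).
const char *clearMaskRegOpcode(const Subtarget &ST) {
  return ST.hasBWI() && ST.hasEVEX512() ? "KXORQrr" : "KXORWrr";
}

int main() {
  Subtarget AVX10_256{/*BWI=*/true, /*EVEX512=*/false};
  Subtarget SKX{/*BWI=*/true, /*EVEX512=*/true};
  std::printf("%s\n", clearMaskRegOpcode(AVX10_256)); // KXORWrr
  std::printf("%s\n", clearMaskRegOpcode(SKX));       // KXORQrr
}
```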
---
Patch is 223.88 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/73074.diff
10 Files Affected:
- (modified) llvm/lib/Target/X86/X86DomainReassignment.cpp (+14-21)
- (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+4-1)
- (modified) llvm/lib/Target/X86/X86InstrInfo.cpp (+7-5)
- (modified) llvm/lib/Target/X86/X86Subtarget.h (+2-1)
- (modified) llvm/test/CodeGen/X86/avx512-mask-op.ll (+831-29)
- (modified) llvm/test/CodeGen/X86/avx512-vec-cmp.ll (+924-151)
- (modified) llvm/test/CodeGen/X86/avx512bw-mask-op.ll (+77-28)
- (modified) llvm/test/CodeGen/X86/kshift.ll (+301)
- (modified) llvm/test/CodeGen/X86/movmsk-cmp.ll (+738)
- (modified) llvm/tools/llvm-exegesis/lib/X86/Target.cpp (+2-1)
``````````diff
diff --git a/llvm/lib/Target/X86/X86DomainReassignment.cpp b/llvm/lib/Target/X86/X86DomainReassignment.cpp
index fa8d5c752a3d273..a1681d9ff73ee5e 100644
--- a/llvm/lib/Target/X86/X86DomainReassignment.cpp
+++ b/llvm/lib/Target/X86/X86DomainReassignment.cpp
@@ -651,37 +651,30 @@ void X86DomainReassignment::initConverters() {
if (STI->hasBWI()) {
createReplacer(X86::MOV32rm, X86::KMOVDkm);
- createReplacer(X86::MOV64rm, X86::KMOVQkm);
-
createReplacer(X86::MOV32mr, X86::KMOVDmk);
- createReplacer(X86::MOV64mr, X86::KMOVQmk);
-
createReplacer(X86::MOV32rr, X86::KMOVDkk);
- createReplacer(X86::MOV64rr, X86::KMOVQkk);
-
createReplacer(X86::SHR32ri, X86::KSHIFTRDri);
- createReplacer(X86::SHR64ri, X86::KSHIFTRQri);
-
createReplacer(X86::SHL32ri, X86::KSHIFTLDri);
- createReplacer(X86::SHL64ri, X86::KSHIFTLQri);
-
createReplacer(X86::ADD32rr, X86::KADDDrr);
- createReplacer(X86::ADD64rr, X86::KADDQrr);
-
createReplacer(X86::NOT32r, X86::KNOTDrr);
- createReplacer(X86::NOT64r, X86::KNOTQrr);
-
createReplacer(X86::OR32rr, X86::KORDrr);
- createReplacer(X86::OR64rr, X86::KORQrr);
-
createReplacer(X86::AND32rr, X86::KANDDrr);
- createReplacer(X86::AND64rr, X86::KANDQrr);
-
createReplacer(X86::ANDN32rr, X86::KANDNDrr);
- createReplacer(X86::ANDN64rr, X86::KANDNQrr);
-
createReplacer(X86::XOR32rr, X86::KXORDrr);
- createReplacer(X86::XOR64rr, X86::KXORQrr);
+
+ if (STI->hasEVEX512()) {
+ createReplacer(X86::MOV64rm, X86::KMOVQkm);
+ createReplacer(X86::MOV64mr, X86::KMOVQmk);
+ createReplacer(X86::MOV64rr, X86::KMOVQkk);
+ createReplacer(X86::SHR64ri, X86::KSHIFTRQri);
+ createReplacer(X86::SHL64ri, X86::KSHIFTLQri);
+ createReplacer(X86::ADD64rr, X86::KADDQrr);
+ createReplacer(X86::NOT64r, X86::KNOTQrr);
+ createReplacer(X86::OR64rr, X86::KORQrr);
+ createReplacer(X86::AND64rr, X86::KANDQrr);
+ createReplacer(X86::ANDN64rr, X86::KANDNQrr);
+ createReplacer(X86::XOR64rr, X86::KXORQrr);
+ }
// TODO: KTEST is not a replacement for TEST due to flag differences. Need
// to prove only Z flag is used.
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 011baa545dd82fe..1ad3ac1f7b89c03 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2056,9 +2056,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// AVX512BW..
if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
- addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
+ if (Subtarget.hasEVEX512())
+ addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
+ if (VT == MVT::v64i1 && !Subtarget.hasEVEX512())
+ continue;
setOperationAction(ISD::VSELECT, VT, Expand);
setOperationAction(ISD::TRUNCATE, VT, Custom);
setOperationAction(ISD::SETCC, VT, Custom);
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 3ca7b427ae2067f..9424319cf7dfcb9 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -3526,7 +3526,7 @@ static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg,
// All KMASK RegClasses hold the same k registers, can be tested against anyone.
if (X86::VK16RegClass.contains(SrcReg)) {
if (X86::GR64RegClass.contains(DestReg)) {
- assert(Subtarget.hasBWI());
+ assert(Subtarget.hasBWI() && Subtarget.hasEVEX512());
return X86::KMOVQrk;
}
if (X86::GR32RegClass.contains(DestReg))
@@ -3539,7 +3539,7 @@ static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg,
// All KMASK RegClasses hold the same k registers, can be tested against anyone.
if (X86::VK16RegClass.contains(DestReg)) {
if (X86::GR64RegClass.contains(SrcReg)) {
- assert(Subtarget.hasBWI());
+ assert(Subtarget.hasBWI() && Subtarget.hasEVEX512());
return X86::KMOVQkr;
}
if (X86::GR32RegClass.contains(SrcReg))
@@ -3653,7 +3653,8 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
Opc = X86::VMOVAPSZrr;
// All KMASK RegClasses hold the same k registers, can be tested against anyone.
else if (X86::VK16RegClass.contains(DestReg, SrcReg))
- Opc = Subtarget.hasBWI() ? X86::KMOVQkk : X86::KMOVWkk;
+ Opc = Subtarget.hasBWI() && Subtarget.hasEVEX512() ? X86::KMOVQkk
+ : X86::KMOVWkk;
if (!Opc)
Opc = CopyToFromAsymmetricReg(DestReg, SrcReg, Subtarget);
@@ -3773,7 +3774,8 @@ static unsigned getLoadStoreRegOpcode(Register Reg,
if (X86::RFP64RegClass.hasSubClassEq(RC))
return Load ? X86::LD_Fp64m : X86::ST_Fp64m;
if (X86::VK64RegClass.hasSubClassEq(RC)) {
- assert(STI.hasBWI() && "KMOVQ requires BWI");
+ assert(STI.hasBWI() && STI.hasEVEX512() &&
+ "KMOVQ requires BWI with 512-bit vectors");
return Load ? X86::KMOVQkm : X86::KMOVQmk;
}
llvm_unreachable("Unknown 8-byte regclass");
@@ -10144,7 +10146,7 @@ void X86InstrInfo::buildClearRegister(Register Reg, MachineBasicBlock &MBB,
return;
// KXOR is safe to use because it doesn't affect flags.
- unsigned Op = ST.hasBWI() ? X86::KXORQrr : X86::KXORWrr;
+ unsigned Op = ST.hasBWI() && ST.hasEVEX512() ? X86::KXORQrr : X86::KXORWrr;
BuildMI(MBB, Iter, DL, get(Op), Reg)
.addReg(Reg, RegState::Undef)
.addReg(Reg, RegState::Undef);
diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h
index a458b5f9ec8fbb9..47d24f4be58a3e1 100644
--- a/llvm/lib/Target/X86/X86Subtarget.h
+++ b/llvm/lib/Target/X86/X86Subtarget.h
@@ -244,7 +244,8 @@ class X86Subtarget final : public X86GenSubtargetInfo {
// TODO: Currently we're always allowing widening on CPUs without VLX,
// because for many cases we don't have a better option.
bool canExtendTo512DQ() const {
- return hasAVX512() && (!hasVLX() || getPreferVectorWidth() >= 512);
+ return hasAVX512() && (!hasVLX() || getPreferVectorWidth() >= 512) &&
+ hasEVEX512();
}
bool canExtendTo512BW() const {
return hasBWI() && canExtendTo512DQ();
diff --git a/llvm/test/CodeGen/X86/avx512-mask-op.ll b/llvm/test/CodeGen/X86/avx512-mask-op.ll
index 9e689341f7b88e3..99eef49417f33b6 100644
--- a/llvm/test/CodeGen/X86/avx512-mask-op.ll
+++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll
@@ -3,6 +3,7 @@
; RUN: llc < %s -stack-symbol-ordering=0 -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq | FileCheck %s --check-prefixes=CHECK,SKX
; RUN: llc < %s -stack-symbol-ordering=0 -mtriple=x86_64-apple-darwin -mattr=+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512BW
; RUN: llc < %s -stack-symbol-ordering=0 -mtriple=x86_64-apple-darwin -mattr=+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512DQ
+; RUN: llc < %s -stack-symbol-ordering=0 -mtriple=x86_64-apple-darwin -mattr=+avx10.1-256 | FileCheck %s --check-prefixes=CHECK,AVX10-256
; RUN: llc < %s -stack-symbol-ordering=0 -mtriple=i686-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq | FileCheck %s --check-prefix=X86
@@ -131,6 +132,13 @@ define void @mask8_mem(ptr %ptr) {
; AVX512DQ-NEXT: kmovb %k0, (%rdi)
; AVX512DQ-NEXT: retq
;
+; AVX10-256-LABEL: mask8_mem:
+; AVX10-256: ## %bb.0:
+; AVX10-256-NEXT: kmovb (%rdi), %k0
+; AVX10-256-NEXT: knotb %k0, %k0
+; AVX10-256-NEXT: kmovb %k0, (%rdi)
+; AVX10-256-NEXT: retq
+;
; X86-LABEL: mask8_mem:
; X86: ## %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -205,6 +213,15 @@ define i16 @mand16_mem(ptr %x, ptr %y) {
; AVX512DQ-NEXT: ## kill: def $ax killed $ax killed $eax
; AVX512DQ-NEXT: retq
;
+; AVX10-256-LABEL: mand16_mem:
+; AVX10-256: ## %bb.0:
+; AVX10-256-NEXT: kmovw (%rdi), %k0
+; AVX10-256-NEXT: kmovw (%rsi), %k1
+; AVX10-256-NEXT: korw %k1, %k0, %k0
+; AVX10-256-NEXT: kmovd %k0, %eax
+; AVX10-256-NEXT: ## kill: def $ax killed $ax killed $eax
+; AVX10-256-NEXT: retq
+;
; X86-LABEL: mand16_mem:
; X86: ## %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -257,6 +274,14 @@ define i8 @shuf_test1(i16 %v) nounwind {
; AVX512DQ-NEXT: ## kill: def $al killed $al killed $eax
; AVX512DQ-NEXT: retq
;
+; AVX10-256-LABEL: shuf_test1:
+; AVX10-256: ## %bb.0:
+; AVX10-256-NEXT: kmovd %edi, %k0
+; AVX10-256-NEXT: kshiftrw $8, %k0, %k0
+; AVX10-256-NEXT: kmovd %k0, %eax
+; AVX10-256-NEXT: ## kill: def $al killed $al killed $eax
+; AVX10-256-NEXT: retq
+;
; X86-LABEL: shuf_test1:
; X86: ## %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
@@ -304,6 +329,15 @@ define i32 @zext_test1(<16 x i32> %a, <16 x i32> %b) {
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
+; AVX10-256-LABEL: zext_test1:
+; AVX10-256: ## %bb.0:
+; AVX10-256-NEXT: vpcmpnleud %ymm2, %ymm0, %k0
+; AVX10-256-NEXT: kshiftrb $5, %k0, %k0
+; AVX10-256-NEXT: kmovd %k0, %eax
+; AVX10-256-NEXT: andl $1, %eax
+; AVX10-256-NEXT: vzeroupper
+; AVX10-256-NEXT: retq
+;
; X86-LABEL: zext_test1:
; X86: ## %bb.0:
; X86-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
@@ -359,6 +393,16 @@ define i16 @zext_test2(<16 x i32> %a, <16 x i32> %b) {
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
+; AVX10-256-LABEL: zext_test2:
+; AVX10-256: ## %bb.0:
+; AVX10-256-NEXT: vpcmpnleud %ymm2, %ymm0, %k0
+; AVX10-256-NEXT: kshiftrb $5, %k0, %k0
+; AVX10-256-NEXT: kmovd %k0, %eax
+; AVX10-256-NEXT: andl $1, %eax
+; AVX10-256-NEXT: ## kill: def $ax killed $ax killed $eax
+; AVX10-256-NEXT: vzeroupper
+; AVX10-256-NEXT: retq
+;
; X86-LABEL: zext_test2:
; X86: ## %bb.0:
; X86-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
@@ -415,6 +459,16 @@ define i8 @zext_test3(<16 x i32> %a, <16 x i32> %b) {
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
+; AVX10-256-LABEL: zext_test3:
+; AVX10-256: ## %bb.0:
+; AVX10-256-NEXT: vpcmpnleud %ymm2, %ymm0, %k0
+; AVX10-256-NEXT: kshiftrb $5, %k0, %k0
+; AVX10-256-NEXT: kmovd %k0, %eax
+; AVX10-256-NEXT: andb $1, %al
+; AVX10-256-NEXT: ## kill: def $al killed $al killed $eax
+; AVX10-256-NEXT: vzeroupper
+; AVX10-256-NEXT: retq
+;
; X86-LABEL: zext_test3:
; X86: ## %bb.0:
; X86-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
@@ -506,6 +560,14 @@ define <4 x i32> @test4(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1, <4 x i64> %y1
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
+; AVX10-256-LABEL: test4:
+; AVX10-256: ## %bb.0:
+; AVX10-256-NEXT: vpcmpgtq %ymm3, %ymm2, %k1
+; AVX10-256-NEXT: vpcmpleq %ymm1, %ymm0, %k0 {%k1}
+; AVX10-256-NEXT: vpmovm2d %k0, %xmm0
+; AVX10-256-NEXT: vzeroupper
+; AVX10-256-NEXT: retq
+;
; X86-LABEL: test4:
; X86: ## %bb.0:
; X86-NEXT: vpcmpgtq %ymm3, %ymm2, %k1
@@ -567,6 +629,13 @@ define <2 x i64> @test5(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1, <2 x i64> %y1
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
+; AVX10-256-LABEL: test5:
+; AVX10-256: ## %bb.0:
+; AVX10-256-NEXT: vpcmpgtq %xmm0, %xmm1, %k1
+; AVX10-256-NEXT: vpcmpleq %xmm3, %xmm2, %k0 {%k1}
+; AVX10-256-NEXT: vpmovm2q %k0, %xmm0
+; AVX10-256-NEXT: retq
+;
; X86-LABEL: test5:
; X86: ## %bb.0:
; X86-NEXT: vpcmpgtq %xmm0, %xmm1, %k1
@@ -645,6 +714,14 @@ define void @test7(<8 x i1> %mask) {
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
+; AVX10-256-LABEL: test7:
+; AVX10-256: ## %bb.0: ## %allocas
+; AVX10-256-NEXT: vpsllw $15, %xmm0, %xmm0
+; AVX10-256-NEXT: vpmovw2m %xmm0, %k0
+; AVX10-256-NEXT: kmovd %k0, %eax
+; AVX10-256-NEXT: orb $85, %al
+; AVX10-256-NEXT: retq
+;
; X86-LABEL: test7:
; X86: ## %bb.0: ## %allocas
; X86-NEXT: vpsllw $15, %xmm0, %xmm0
@@ -732,6 +809,24 @@ define <16 x i8> @test8(<16 x i32>%a, <16 x i32>%b, i32 %a1, i32 %b1) {
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
+; AVX10-256-LABEL: test8:
+; AVX10-256: ## %bb.0:
+; AVX10-256-NEXT: cmpl %esi, %edi
+; AVX10-256-NEXT: jg LBB17_1
+; AVX10-256-NEXT: ## %bb.2:
+; AVX10-256-NEXT: kxorw %k0, %k0, %k0
+; AVX10-256-NEXT: vpmovm2b %k0, %xmm0
+; AVX10-256-NEXT: vzeroupper
+; AVX10-256-NEXT: retq
+; AVX10-256-NEXT: LBB17_1:
+; AVX10-256-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX10-256-NEXT: vpcmpgtd %ymm2, %ymm0, %k0
+; AVX10-256-NEXT: vpcmpgtd %ymm2, %ymm1, %k1
+; AVX10-256-NEXT: kunpckbw %k0, %k1, %k0
+; AVX10-256-NEXT: vpmovm2b %k0, %xmm0
+; AVX10-256-NEXT: vzeroupper
+; AVX10-256-NEXT: retq
+;
; X86-LABEL: test8:
; X86: ## %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -821,6 +916,20 @@ define <16 x i1> @test9(<16 x i1>%a, <16 x i1>%b, i32 %a1, i32 %b1) {
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
+; AVX10-256-LABEL: test9:
+; AVX10-256: ## %bb.0:
+; AVX10-256-NEXT: cmpl %esi, %edi
+; AVX10-256-NEXT: jg LBB18_1
+; AVX10-256-NEXT: ## %bb.2:
+; AVX10-256-NEXT: vpsllw $7, %xmm1, %xmm0
+; AVX10-256-NEXT: jmp LBB18_3
+; AVX10-256-NEXT: LBB18_1:
+; AVX10-256-NEXT: vpsllw $7, %xmm0, %xmm0
+; AVX10-256-NEXT: LBB18_3:
+; AVX10-256-NEXT: vpmovb2m %xmm0, %k0
+; AVX10-256-NEXT: vpmovm2b %k0, %xmm0
+; AVX10-256-NEXT: retq
+;
; X86-LABEL: test9:
; X86: ## %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -907,6 +1016,20 @@ define <8 x i1> @test10(<8 x i1>%a, <8 x i1>%b, i32 %a1, i32 %b1) {
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
+; AVX10-256-LABEL: test10:
+; AVX10-256: ## %bb.0:
+; AVX10-256-NEXT: cmpl %esi, %edi
+; AVX10-256-NEXT: jg LBB19_1
+; AVX10-256-NEXT: ## %bb.2:
+; AVX10-256-NEXT: vpsllw $15, %xmm1, %xmm0
+; AVX10-256-NEXT: jmp LBB19_3
+; AVX10-256-NEXT: LBB19_1:
+; AVX10-256-NEXT: vpsllw $15, %xmm0, %xmm0
+; AVX10-256-NEXT: LBB19_3:
+; AVX10-256-NEXT: vpmovw2m %xmm0, %k0
+; AVX10-256-NEXT: vpmovm2w %k0, %xmm0
+; AVX10-256-NEXT: retq
+;
; X86-LABEL: test10:
; X86: ## %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -989,6 +1112,20 @@ define <4 x i1> @test11(<4 x i1>%a, <4 x i1>%b, i32 %a1, i32 %b1) {
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
+; AVX10-256-LABEL: test11:
+; AVX10-256: ## %bb.0:
+; AVX10-256-NEXT: cmpl %esi, %edi
+; AVX10-256-NEXT: jg LBB20_1
+; AVX10-256-NEXT: ## %bb.2:
+; AVX10-256-NEXT: vpslld $31, %xmm1, %xmm0
+; AVX10-256-NEXT: jmp LBB20_3
+; AVX10-256-NEXT: LBB20_1:
+; AVX10-256-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX10-256-NEXT: LBB20_3:
+; AVX10-256-NEXT: vpmovd2m %xmm0, %k0
+; AVX10-256-NEXT: vpmovm2d %k0, %xmm0
+; AVX10-256-NEXT: retq
+;
; X86-LABEL: test11:
; X86: ## %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -1120,6 +1257,16 @@ define <16 x i1> @test15(i32 %x, i32 %y) {
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
+; AVX10-256-LABEL: test15:
+; AVX10-256: ## %bb.0:
+; AVX10-256-NEXT: cmpl %esi, %edi
+; AVX10-256-NEXT: movl $21845, %eax ## imm = 0x5555
+; AVX10-256-NEXT: movl $1, %ecx
+; AVX10-256-NEXT: cmovgl %eax, %ecx
+; AVX10-256-NEXT: kmovd %ecx, %k0
+; AVX10-256-NEXT: vpmovm2b %k0, %xmm0
+; AVX10-256-NEXT: retq
+;
; X86-LABEL: test15:
; X86: ## %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -1231,6 +1378,23 @@ define <64 x i8> @test16(i64 %x) {
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512DQ-NEXT: retq
;
+; AVX10-256-LABEL: test16:
+; AVX10-256: ## %bb.0:
+; AVX10-256-NEXT: kmovd %edi, %k0
+; AVX10-256-NEXT: shrq $32, %rdi
+; AVX10-256-NEXT: kmovd %edi, %k1
+; AVX10-256-NEXT: movl $-33, %eax
+; AVX10-256-NEXT: kmovd %eax, %k2
+; AVX10-256-NEXT: kandd %k2, %k0, %k0
+; AVX10-256-NEXT: movb $1, %al
+; AVX10-256-NEXT: kmovd %eax, %k2
+; AVX10-256-NEXT: kshiftld $31, %k2, %k2
+; AVX10-256-NEXT: kshiftrd $26, %k2, %k2
+; AVX10-256-NEXT: kord %k2, %k0, %k0
+; AVX10-256-NEXT: vpmovm2b %k0, %ymm0
+; AVX10-256-NEXT: vpmovm2b %k1, %ymm1
+; AVX10-256-NEXT: retq
+;
; X86-LABEL: test16:
; X86: ## %bb.0:
; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k0
@@ -1350,6 +1514,24 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) {
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512DQ-NEXT: retq
;
+; AVX10-256-LABEL: test17:
+; AVX10-256: ## %bb.0:
+; AVX10-256-NEXT: kmovd %edi, %k0
+; AVX10-256-NEXT: shrq $32, %rdi
+; AVX10-256-NEXT: kmovd %edi, %k1
+; AVX10-256-NEXT: cmpl %edx, %esi
+; AVX10-256-NEXT: setg %al
+; AVX10-256-NEXT: movl $-33, %ecx
+; AVX10-256-NEXT: kmovd %ecx, %k2
+; AVX10-256-NEXT: kandd %k2, %k0, %k0
+; AVX10-256-NEXT: kmovd %eax, %k2
+; AVX10-256-NEXT: kshiftld $31, %k2, %k2
+; AVX10-256-NEXT: kshiftrd $26, %k2, %k2
+; AVX10-256-NEXT: kord %k2, %k0, %k0
+; AVX10-256-NEXT: vpmovm2b %k0, %ymm0
+; AVX10-256-NEXT: vpmovm2b %k1, %ymm1
+; AVX10-256-NEXT: retq
+;
; X86-LABEL: test17:
; X86: ## %bb.0:
; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k0
@@ -1455,6 +1637,24 @@ define <8 x i1> @test18(i8 %a, i16 %y) {
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
+; AVX10-256-LABEL: test18:
+; AVX10-256: ## %bb.0:
+; AVX10-256-NEXT: kmovd %edi, %k0
+; AVX10-256-NEXT: kmovd %esi, %k1
+; AVX10-256-NEXT: kshiftrw $8, %k1, %k2
+; AVX10-256-NEXT: kshiftrw $9, %k1, %k1
+; AVX10-256-NEXT: movb $-65, %al
+; AVX10-256-NEXT: kmovd %eax, %k3
+; AVX10-256-NEXT: kandb %k3, %k0, %k0
+; AVX10-256-NEXT: kshiftlb $6, %k1, %k1
+; AVX10-256-NEXT: korb %k1, %k0, %k0
+; AVX10-256-NEXT: kshiftlb $1, %k0, %k0
+; AVX10-256-NEXT: kshiftrb $1, %k0, %k0
+; AVX10-256-NEXT: kshiftlb $7, %k2, %k1
+; AVX10-256-NEXT: korb %k1, %k0, %k0
+; AVX10-256-NEXT: vpmovm2w %k0, %xmm0
+; AVX10-256-NEXT: retq
+;
; X86-LABEL: test18:
; X86: ## %bb.0:
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k0
@@ -1521,6 +1721,15 @@ define <32 x i16> @test21(<32 x i16> %x , <32 x i1> %mask) nounwind readnone {
; AVX512DQ-NEXT: vpandq %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT: retq
;
+; AVX10-256-LABEL: test21:
+; AVX10-256: ## %bb.0:
+; AVX10-256-NEXT: vpsllw $7, %ymm2, %ymm2
+; AVX10-256-NEXT: vpmovb2m %ymm2, %k1
+; AVX10-256-NEXT: vmovdqu16 %ymm0, %ymm0 {%k1} {z}
+; AVX10-256-NEXT: kshiftrd $16, %k1, %k1
+; AVX10-256-NEXT: vmovdqu16 %ymm1, %ymm1 {%k1} {z}
+; AVX10-256-NEXT: retq
+;
; X86-LABEL: test21:
; X86: ## %bb.0:
; X86-NEXT: vpsllw $7, %ymm1, %ymm1
@@ -1571,6 +1780,13 @@ define void @test22(<4 x i1> %a, ptr %addr) {
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
+; AVX10-256-LABEL: test22:
+; AVX10-256: ## %bb.0:
+; AVX10-256-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX10-256-NEXT: vpmovd2m %xmm0, %k0
+; AVX10-256-NEXT: kmovb %k0, (%rdi)
+; AVX10-256-NEXT: retq
+;
; X86-LABEL: test22:
; X86: ## %bb.0:
; X86-NEXT: vpslld $31, %xmm0, %xmm0
@@ -1622,6 +1838,13 @@ define void @test23(<2 x i1> %a, ptr %addr) {
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
+; AVX10-256-LABEL: test23:
+; AVX10-256: ## %bb.0:
+; AVX10-256-NEXT: vpsllq $63, %xmm0, %xmm0
+; AVX10-256-NEXT: vpmovq2m %xmm0, %k0
+; AVX10-256-NEXT: kmovb %k0, (%rdi)
+; AVX10-256-NEXT: retq
+;
; X86-LABEL: test23:
; X86: ## %bb.0:
; X86-NEXT: vpsllq $63, %xmm0, %xmm0
@@ -1672,6 +1895,15 @@ define void @store_v1i1(<1 x i1> %c , ptr %ptr) {
; AVX512DQ-NEXT: kmovb %k0, (%rsi)
; AVX512DQ-NEXT: retq
;
+; AVX10-256-LABEL: store_v1i1:
+; AVX10-256: ## %bb.0:
+; AVX10-256-NEXT: kmovd %edi, %k0
+; AVX10-256-NEXT: knotw %k0, %k0
+; AVX10-256-NEXT: kshiftlb $7, %k0, %k0
+; AVX10-256-NEXT: kshiftrb $7, %k0, %k0
+; AVX10-256-NEXT: kmovb %k0, (%rsi)
+; AVX10-256-NEXT: retq
+;
; X86-LABEL: store_v1i1:
; X86: ## %bb.0:
; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k0
@@ -1730,6 +1962,16 @@ define void @store_v2i1(<2 x i1> %c , ptr %ptr) {
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
+; AVX10-256-LABEL: store_v2i1:
+; AVX10-256: ## %bb.0:
+; AVX10-256-NEXT: vpsllq $63, %xmm0, %xmm0
+; AVX10-256-NEXT: vpmovq2m %xmm0, %k0
+...
[truncated]
``````````
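The common thread across the hunks above is the same subtarget predicate; for reference, a simplified standalone restatement of the updated canExtendTo512DQ/canExtendTo512BW logic from X86Subtarget.h (plain booleans replace the feature-bit accessors; a sketch, not the actual class):

```cpp
// Simplified restatement of the widening predicates after this patch:
// extending operations to 512-bit vectors now also requires EVEX512, so
// AVX10.1-256 (the AVX512 feature bits, but no 512-bit vectors) opts out.
#include <cassert>

struct Features {
  bool AVX512, BWI, VLX, EVEX512;
  unsigned PreferVectorWidth;

  bool canExtendTo512DQ() const {
    return AVX512 && (!VLX || PreferVectorWidth >= 512) && EVEX512;
  }
  bool canExtendTo512BW() const { return BWI && canExtendTo512DQ(); }
};

int main() {
  Features AVX10_256{true, true, true, /*EVEX512=*/false, 256};
  Features SKX{true, true, true, /*EVEX512=*/true, 512};
  assert(!AVX10_256.canExtendTo512DQ()); // must stay at <=256-bit vectors
  assert(SKX.canExtendTo512BW());        // 512-bit widening still allowed
  return 0;
}
```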
https://github.com/llvm/llvm-project/pull/73074
More information about the llvm-commits mailing list