[llvm] [X86] combineKSHIFT - fold kshiftr(kshiftr/extract_subvector(X, C1), C2) --> kshiftr(X, C1+C2) (PR #115528)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Nov 8 10:12:03 PST 2024
llvmbot wrote:
@llvm/pr-subscribers-backend-x86
Author: Simon Pilgrim (RKSimon)
Changes:
Merge serial KSHIFTR nodes, possibly separated by an EXTRACT_SUBVECTOR, so that the mask shifts read the original mask and can be computed in parallel instead of as a dependent chain.
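
As an illustrative sketch of the effect (taken from the avx512-bugfix-26264.ll hunk below; the register names are just those used in the test), a serial chain where the second shift consumes the result of the first:

```asm
# before: the second shift depends on the first
kshiftrd $16, %k1, %k1    # k1 = mask >> 16
kshiftrw $8,  %k1, %k1    # k1 = (mask >> 16) >> 8
```

is rewritten so that both shifts take the original mask as input and have no dependency between them:

```asm
# after: shift amounts folded to C1+C2, both shifts read %k1 directly
kshiftrd $16, %k1, %k2    # k2 = mask >> 16
kshiftrd $24, %k1, %k1    # k1 = mask >> 24
```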
---
Patch is 45.20 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/115528.diff
9 Files Affected:
- (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+21-2)
- (modified) llvm/test/CodeGen/X86/avx512-bugfix-26264.ll (+8-8)
- (modified) llvm/test/CodeGen/X86/avx512-masked-memop-64-32.ll (+4-4)
- (modified) llvm/test/CodeGen/X86/pr33349.ll (+12-12)
- (modified) llvm/test/CodeGen/X86/pr34177.ll (+6-6)
- (modified) llvm/test/CodeGen/X86/vec_smulo.ll (+4-4)
- (modified) llvm/test/CodeGen/X86/vec_umulo.ll (+4-4)
- (modified) llvm/test/CodeGen/X86/vector-compress.ll (+4-4)
- (modified) llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll (+145-145)
``````````diff
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 19a85a6d7ec6ce..748f885e3f8d90 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -58405,11 +58405,30 @@ static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG,
static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
EVT VT = N->getValueType(0);
-
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
return DAG.getConstant(0, SDLoc(N), VT);
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ // Fold kshiftr(extract_subvector(X,C1),C2)
+ // --> extract_subvector(kshiftr(X,C1+C2),0)
+ // Fold kshiftr(kshiftr(X,C1),C2) --> kshiftr(X,C1+C2)
+ if (N->getOpcode() == X86ISD::KSHIFTR) {
+ SDLoc DL(N);
+ if (N->getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR ||
+ N->getOperand(0).getOpcode() == X86ISD::KSHIFTR) {
+ SDValue Src = N->getOperand(0).getOperand(0);
+ uint64_t Amt = N->getConstantOperandVal(1) +
+ N->getOperand(0).getConstantOperandVal(1);
+ EVT SrcVT = Src.getValueType();
+ if (TLI.isTypeLegal(SrcVT) && Amt < SrcVT.getVectorNumElements()) {
+ SDValue Shift = DAG.getNode(X86ISD::KSHIFTR, DL, SrcVT, Src,
+ DAG.getTargetConstant(Amt, DL, MVT::i8));
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shift,
+ DAG.getIntPtrConstant(0, DL));
+ }
+ }
+ }
+
APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
return SDValue(N, 0);
diff --git a/llvm/test/CodeGen/X86/avx512-bugfix-26264.ll b/llvm/test/CodeGen/X86/avx512-bugfix-26264.ll
index 537f42dd9c2c59..e0f3b6c4ec90a4 100644
--- a/llvm/test/CodeGen/X86/avx512-bugfix-26264.ll
+++ b/llvm/test/CodeGen/X86/avx512-bugfix-26264.ll
@@ -7,11 +7,11 @@ define <32 x double> @test_load_32f64(ptr %ptrs, <32 x i1> %mask, <32 x double>
; AVX512BW-NEXT: vpsllw $7, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovb2m %zmm0, %k1
; AVX512BW-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
-; AVX512BW-NEXT: kshiftrw $8, %k1, %k2
+; AVX512BW-NEXT: kshiftrd $8, %k1, %k2
; AVX512BW-NEXT: vblendmpd 64(%rdi), %zmm2, %zmm1 {%k2}
-; AVX512BW-NEXT: kshiftrd $16, %k1, %k1
-; AVX512BW-NEXT: vblendmpd 128(%rdi), %zmm3, %zmm2 {%k1}
-; AVX512BW-NEXT: kshiftrw $8, %k1, %k1
+; AVX512BW-NEXT: kshiftrd $16, %k1, %k2
+; AVX512BW-NEXT: vblendmpd 128(%rdi), %zmm3, %zmm2 {%k2}
+; AVX512BW-NEXT: kshiftrd $24, %k1, %k1
; AVX512BW-NEXT: vblendmpd 192(%rdi), %zmm4, %zmm3 {%k1}
; AVX512BW-NEXT: retq
%res = call <32 x double> @llvm.masked.load.v32f64.p0(ptr %ptrs, i32 4, <32 x i1> %mask, <32 x double> %src0)
@@ -24,11 +24,11 @@ define <32 x i64> @test_load_32i64(ptr %ptrs, <32 x i1> %mask, <32 x i64> %src0)
; AVX512BW-NEXT: vpsllw $7, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovb2m %zmm0, %k1
; AVX512BW-NEXT: vpblendmq (%rdi), %zmm1, %zmm0 {%k1}
-; AVX512BW-NEXT: kshiftrw $8, %k1, %k2
+; AVX512BW-NEXT: kshiftrd $8, %k1, %k2
; AVX512BW-NEXT: vpblendmq 64(%rdi), %zmm2, %zmm1 {%k2}
-; AVX512BW-NEXT: kshiftrd $16, %k1, %k1
-; AVX512BW-NEXT: vpblendmq 128(%rdi), %zmm3, %zmm2 {%k1}
-; AVX512BW-NEXT: kshiftrw $8, %k1, %k1
+; AVX512BW-NEXT: kshiftrd $16, %k1, %k2
+; AVX512BW-NEXT: vpblendmq 128(%rdi), %zmm3, %zmm2 {%k2}
+; AVX512BW-NEXT: kshiftrd $24, %k1, %k1
; AVX512BW-NEXT: vpblendmq 192(%rdi), %zmm4, %zmm3 {%k1}
; AVX512BW-NEXT: retq
%res = call <32 x i64> @llvm.masked.load.v32i64.p0(ptr %ptrs, i32 4, <32 x i1> %mask, <32 x i64> %src0)
diff --git a/llvm/test/CodeGen/X86/avx512-masked-memop-64-32.ll b/llvm/test/CodeGen/X86/avx512-masked-memop-64-32.ll
index bd52b9cd41584c..f6e5986afac531 100644
--- a/llvm/test/CodeGen/X86/avx512-masked-memop-64-32.ll
+++ b/llvm/test/CodeGen/X86/avx512-masked-memop-64-32.ll
@@ -261,11 +261,11 @@ define <32 x double> @test_load_32f64(ptr %ptrs, <32 x i1> %mask, <32 x double>
; SKX-NEXT: vpsllw $7, %ymm0, %ymm0
; SKX-NEXT: vpmovb2m %ymm0, %k1
; SKX-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
-; SKX-NEXT: kshiftrw $8, %k1, %k2
+; SKX-NEXT: kshiftrd $8, %k1, %k2
; SKX-NEXT: vblendmpd 64(%rdi), %zmm2, %zmm1 {%k2}
-; SKX-NEXT: kshiftrd $16, %k1, %k1
-; SKX-NEXT: vblendmpd 128(%rdi), %zmm3, %zmm2 {%k1}
-; SKX-NEXT: kshiftrw $8, %k1, %k1
+; SKX-NEXT: kshiftrd $16, %k1, %k2
+; SKX-NEXT: vblendmpd 128(%rdi), %zmm3, %zmm2 {%k2}
+; SKX-NEXT: kshiftrd $24, %k1, %k1
; SKX-NEXT: vblendmpd 192(%rdi), %zmm4, %zmm3 {%k1}
; SKX-NEXT: retq
%res = call <32 x double> @llvm.masked.load.v32f64.p0(ptr %ptrs, i32 4, <32 x i1> %mask, <32 x double> %src0)
diff --git a/llvm/test/CodeGen/X86/pr33349.ll b/llvm/test/CodeGen/X86/pr33349.ll
index 83d3a33572266f..c879cb9867ab29 100644
--- a/llvm/test/CodeGen/X86/pr33349.ll
+++ b/llvm/test/CodeGen/X86/pr33349.ll
@@ -17,23 +17,23 @@ target triple = "x86_64-unknown-linux-gnu"
; KNL-NEXT: fldz
; KNL-NEXT: fld %st(0)
; KNL-NEXT: fcmovne %st(2), %st
-; KNL-NEXT: testb $2, %al
-; KNL-NEXT: fld %st(1)
-; KNL-NEXT: fcmovne %st(3), %st
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: testb $1, %al
+; KNL-NEXT: fld %st(1)
+; KNL-NEXT: fcmovne %st(3), %st
+; KNL-NEXT: testb $2, %al
; KNL-NEXT: fld %st(2)
; KNL-NEXT: fcmovne %st(4), %st
-; KNL-NEXT: testb $2, %al
+; KNL-NEXT: testb $8, %al
; KNL-NEXT: fxch %st(3)
; KNL-NEXT: fcmovne %st(4), %st
; KNL-NEXT: fstp %st(4)
; KNL-NEXT: fxch %st(3)
+; KNL-NEXT: fstpt 30(%rdi)
+; KNL-NEXT: fxch %st(1)
; KNL-NEXT: fstpt 10(%rdi)
; KNL-NEXT: fxch %st(1)
; KNL-NEXT: fstpt (%rdi)
-; KNL-NEXT: fxch %st(1)
-; KNL-NEXT: fstpt 30(%rdi)
; KNL-NEXT: fstpt 20(%rdi)
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
@@ -49,23 +49,23 @@ target triple = "x86_64-unknown-linux-gnu"
; SKX-NEXT: fldz
; SKX-NEXT: fld %st(0)
; SKX-NEXT: fcmovne %st(2), %st
-; SKX-NEXT: testb $2, %al
-; SKX-NEXT: fld %st(1)
-; SKX-NEXT: fcmovne %st(3), %st
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: testb $1, %al
+; SKX-NEXT: fld %st(1)
+; SKX-NEXT: fcmovne %st(3), %st
+; SKX-NEXT: testb $2, %al
; SKX-NEXT: fld %st(2)
; SKX-NEXT: fcmovne %st(4), %st
-; SKX-NEXT: testb $2, %al
+; SKX-NEXT: testb $8, %al
; SKX-NEXT: fxch %st(3)
; SKX-NEXT: fcmovne %st(4), %st
; SKX-NEXT: fstp %st(4)
; SKX-NEXT: fxch %st(3)
+; SKX-NEXT: fstpt 30(%rdi)
+; SKX-NEXT: fxch %st(1)
; SKX-NEXT: fstpt 10(%rdi)
; SKX-NEXT: fxch %st(1)
; SKX-NEXT: fstpt (%rdi)
-; SKX-NEXT: fxch %st(1)
-; SKX-NEXT: fstpt 30(%rdi)
; SKX-NEXT: fstpt 20(%rdi)
; SKX-NEXT: retq
bb:
diff --git a/llvm/test/CodeGen/X86/pr34177.ll b/llvm/test/CodeGen/X86/pr34177.ll
index 29922c2ac1a716..5b2431eb214955 100644
--- a/llvm/test/CodeGen/X86/pr34177.ll
+++ b/llvm/test/CodeGen/X86/pr34177.ll
@@ -51,18 +51,18 @@ define void @test(<4 x i64> %a, <4 x x86_fp80> %b, ptr %c) local_unnamed_addr {
; AVX512VL-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0
; AVX512VL-NEXT: kshiftrb $2, %k0, %k1
; AVX512VL-NEXT: kmovd %k0, %eax
-; AVX512VL-NEXT: testb $2, %al
+; AVX512VL-NEXT: testb $8, %al
; AVX512VL-NEXT: fld1
; AVX512VL-NEXT: fldz
; AVX512VL-NEXT: fld %st(0)
; AVX512VL-NEXT: fcmovne %st(2), %st
-; AVX512VL-NEXT: testb $1, %al
+; AVX512VL-NEXT: testb $2, %al
; AVX512VL-NEXT: fld %st(1)
; AVX512VL-NEXT: fcmovne %st(3), %st
-; AVX512VL-NEXT: kmovd %k1, %eax
-; AVX512VL-NEXT: testb $2, %al
+; AVX512VL-NEXT: testb $1, %al
; AVX512VL-NEXT: fld %st(2)
; AVX512VL-NEXT: fcmovne %st(4), %st
+; AVX512VL-NEXT: kmovd %k1, %eax
; AVX512VL-NEXT: testb $1, %al
; AVX512VL-NEXT: fxch %st(3)
; AVX512VL-NEXT: fcmovne %st(4), %st
@@ -77,12 +77,12 @@ define void @test(<4 x i64> %a, <4 x x86_fp80> %b, ptr %c) local_unnamed_addr {
; AVX512VL-NEXT: fstpt 10(%rdi)
; AVX512VL-NEXT: fxch %st(1)
; AVX512VL-NEXT: fadd %st, %st(0)
+; AVX512VL-NEXT: fstpt 60(%rdi)
+; AVX512VL-NEXT: fadd %st, %st(0)
; AVX512VL-NEXT: fstpt 20(%rdi)
; AVX512VL-NEXT: fadd %st, %st(0)
; AVX512VL-NEXT: fstpt (%rdi)
; AVX512VL-NEXT: fadd %st, %st(0)
-; AVX512VL-NEXT: fstpt 60(%rdi)
-; AVX512VL-NEXT: fadd %st, %st(0)
; AVX512VL-NEXT: fstpt 40(%rdi)
%1 = icmp eq <4 x i64> <i64 0, i64 1, i64 2, i64 3>, %a
%2 = select <4 x i1> %1, <4 x x86_fp80> <x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK3FFF8000000000000000>, <4 x x86_fp80> zeroinitializer
diff --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll
index 22b5246443fa8a..7e081310c35be5 100644
--- a/llvm/test/CodeGen/X86/vec_smulo.ll
+++ b/llvm/test/CodeGen/X86/vec_smulo.ll
@@ -2668,11 +2668,11 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind {
; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
; AVX512BW-NEXT: vpcmpneqb %zmm1, %zmm0, %k1
; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
-; AVX512BW-NEXT: kshiftrd $16, %k1, %k2
+; AVX512BW-NEXT: kshiftrq $16, %k1, %k2
; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm1 {%k2} {z} = -1
-; AVX512BW-NEXT: kshiftrq $32, %k1, %k1
-; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm2 {%k1} {z} = -1
-; AVX512BW-NEXT: kshiftrd $16, %k1, %k1
+; AVX512BW-NEXT: kshiftrq $32, %k1, %k2
+; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm2 {%k2} {z} = -1
+; AVX512BW-NEXT: kshiftrq $48, %k1, %k1
; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm3 {%k1} {z} = -1
; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rdi)
; AVX512BW-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vec_umulo.ll b/llvm/test/CodeGen/X86/vec_umulo.ll
index 4d7d2573183e07..68c6ca93576b76 100644
--- a/llvm/test/CodeGen/X86/vec_umulo.ll
+++ b/llvm/test/CodeGen/X86/vec_umulo.ll
@@ -2329,11 +2329,11 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind {
; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vptestmb %zmm0, %zmm0, %k1
; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
-; AVX512BW-NEXT: kshiftrd $16, %k1, %k2
+; AVX512BW-NEXT: kshiftrq $16, %k1, %k2
; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm1 {%k2} {z} = -1
-; AVX512BW-NEXT: kshiftrq $32, %k1, %k1
-; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm2 {%k1} {z} = -1
-; AVX512BW-NEXT: kshiftrd $16, %k1, %k1
+; AVX512BW-NEXT: kshiftrq $32, %k1, %k2
+; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm2 {%k2} {z} = -1
+; AVX512BW-NEXT: kshiftrq $48, %k1, %k1
; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm3 {%k1} {z} = -1
; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rdi)
; AVX512BW-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-compress.ll b/llvm/test/CodeGen/X86/vector-compress.ll
index f8c076db65de94..17b98b5ebcaeae 100644
--- a/llvm/test/CodeGen/X86/vector-compress.ll
+++ b/llvm/test/CodeGen/X86/vector-compress.ll
@@ -840,12 +840,12 @@ define <64 x i32> @test_compress_large(<64 x i1> %mask, <64 x i32> %vec, <64 x i
; AVX512VL-NEXT: subq $576, %rsp # imm = 0x240
; AVX512VL-NEXT: vpsllw $7, %zmm0, %zmm0
; AVX512VL-NEXT: vpmovb2m %zmm0, %k1
+; AVX512VL-NEXT: kshiftrq $48, %k1, %k3
; AVX512VL-NEXT: kshiftrq $32, %k1, %k4
-; AVX512VL-NEXT: kshiftrd $16, %k4, %k3
-; AVX512VL-NEXT: kshiftrd $16, %k1, %k2
+; AVX512VL-NEXT: kshiftrq $16, %k1, %k2
; AVX512VL-NEXT: vpcompressd %zmm1, %zmm0 {%k1} {z}
; AVX512VL-NEXT: vmovdqa64 %zmm0, (%rsp)
-; AVX512VL-NEXT: kshiftrw $8, %k1, %k0
+; AVX512VL-NEXT: kshiftrq $8, %k1, %k0
; AVX512VL-NEXT: kxorw %k0, %k1, %k0
; AVX512VL-NEXT: kshiftrw $4, %k0, %k5
; AVX512VL-NEXT: kxorw %k5, %k0, %k0
@@ -859,7 +859,7 @@ define <64 x i32> @test_compress_large(<64 x i1> %mask, <64 x i32> %vec, <64 x i
; AVX512VL-NEXT: vmovdqa64 %zmm0, (%rsp,%rax,4)
; AVX512VL-NEXT: vpcompressd %zmm3, %zmm0 {%k4} {z}
; AVX512VL-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: kshiftrw $8, %k4, %k0
+; AVX512VL-NEXT: kshiftrq $40, %k1, %k0
; AVX512VL-NEXT: kxorw %k0, %k4, %k0
; AVX512VL-NEXT: kshiftrw $4, %k0, %k4
; AVX512VL-NEXT: kxorw %k4, %k0, %k0
diff --git a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll
index 85e782e9083492..36a902637272d7 100644
--- a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll
+++ b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll
@@ -256,12 +256,12 @@ define void @mask_replication_factor2_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,2,3,2,3]
; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k1
-; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2
+; AVX512BW-ONLY-NEXT: kshiftrq $16, %k1, %k2
; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
-; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1
-; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2
+; AVX512BW-ONLY-NEXT: kshiftrq $48, %k1, %k2
; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
+; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1
; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
@@ -277,12 +277,12 @@ define void @mask_replication_factor2_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0
; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k1
-; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2
+; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k1, %k2
; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
-; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1
-; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2
+; AVX512VBMI-ONLY-NEXT: kshiftrq $48, %k1, %k2
; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
+; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1
; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
@@ -409,19 +409,19 @@ define void @mask_replication_factor2_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,2,3,2,3]
; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm0, %zmm0
; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k2
-; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3
+; AVX512BW-ONLY-NEXT: kshiftrq $16, %k2, %k3
; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k3} {z}
; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z}
-; AVX512BW-ONLY-NEXT: kshiftrq $32, %k2, %k2
-; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3
+; AVX512BW-ONLY-NEXT: kshiftrq $48, %k2, %k3
; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k3} {z}
+; AVX512BW-ONLY-NEXT: kshiftrq $32, %k2, %k2
; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k2} {z}
-; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2
+; AVX512BW-ONLY-NEXT: kshiftrq $16, %k1, %k2
; AVX512BW-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k2} {z}
; AVX512BW-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k1} {z}
-; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1
-; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2
+; AVX512BW-ONLY-NEXT: kshiftrq $48, %k1, %k2
; AVX512BW-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
+; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1
; AVX512BW-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx)
@@ -444,19 +444,19 @@ define void @mask_replication_factor2_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0
; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k2
-; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3
+; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k2, %k3
; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k3} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z}
-; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k2, %k2
-; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3
+; AVX512VBMI-ONLY-NEXT: kshiftrq $48, %k2, %k3
; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k3} {z}
+; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k2, %k2
; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k2} {z}
-; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2
+; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k1, %k2
; AVX512VBMI-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k2} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k1} {z}
-; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1
-; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2
+; AVX512VBMI-ONLY-NEXT: kshiftrq $48, %k1, %k2
; AVX512VBMI-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
+; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1
; AVX512VBMI-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx)
@@ -2605,12 +2605,12 @@ define void @mask_replication_factor4_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,20,20,20,20,21,21,21,21,22,22,22,22,23,23,23,23,40,40,40,40,41,41,41,41,42,42,42,42,43,43,43,43,60,60,60,60,61,61,61,61,62,62,62,62,63,63,63,63]
; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k1
-; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2
+; AVX512BW-ONLY-NEXT: kshiftrq $16, %k1, %k2
; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
-; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1
-; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2
+; AVX512BW-ONLY-NEXT: kshiftrq $48, %k1, %k2
; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
+; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1
; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
@@ -2626,12 +2626,12 @@ define void @mask_replication_factor4_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0
; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k1
-; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2
+; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k1, %k2
; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
-; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1
-; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2
+; AVX512VBMI-ONLY-NEXT: kshiftrq $48, %k1, %k2
; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
+; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1
; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
@@ -2753,19 +2753,19 @@ define void @mask_replication_factor4_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
; AVX...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/115528