[llvm] goldsteinn/bugfix pr67287 (PR #68369)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Oct 5 18:05:33 PDT 2023
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-x86
<details>
<summary>Changes</summary>
- [X86] Add tests for incorrectly optimizing out shuffle used in `movmsk`; PR67287
- [X86] Fix/improve logic for optimizing `movmsk(bitcast(shuffle(x)))`; PR67287
---
Full diff: https://github.com/llvm/llvm-project/pull/68369.diff
3 Files Affected:
- (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+41-7)
- (modified) llvm/test/CodeGen/X86/combine-ptest.ll (-1)
- (modified) llvm/test/CodeGen/X86/movmsk-cmp.ll (+138)
``````````diff
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index c4cd2a672fe7b26..dab823e71aa9367 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -45836,18 +45836,52 @@ static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
// MOVMSK(SHUFFLE(X,u)) -> MOVMSK(X) iff every element is referenced.
SmallVector<int, 32> ShuffleMask;
SmallVector<SDValue, 2> ShuffleInputs;
+ SDValue BaseVec = peekThroughBitcasts(Vec);
if (NumElts <= CmpBits &&
- getTargetShuffleInputs(peekThroughBitcasts(Vec), ShuffleInputs,
- ShuffleMask, DAG) &&
+ getTargetShuffleInputs(BaseVec, ShuffleInputs, ShuffleMask, DAG) &&
ShuffleInputs.size() == 1 && !isAnyZeroOrUndef(ShuffleMask) &&
ShuffleInputs[0].getValueSizeInBits() == VecVT.getSizeInBits()) {
unsigned NumShuffleElts = ShuffleMask.size();
- APInt DemandedElts = APInt::getZero(NumShuffleElts);
- for (int M : ShuffleMask) {
- assert(0 <= M && M < (int)NumShuffleElts && "Bad unary shuffle index");
- DemandedElts.setBit(M);
+
+ APInt Result = APInt::getZero(NumShuffleElts);
+ APInt ImportantLocs;
+ // Since we peek through a bitcast, we need to be careful if the base vector
+ // type has smaller elements than the MOVMSK type. In that case, even if
+ // all the elements are demanded by the shuffle mask, only the "high"
+ // elements which have highbits that align with highbits in the MOVMSK vec
+ // elements are actually demanded. Simplification of spurious operations
+ // on the "low" elements takes place during other simplifications.
+ //
+ // For example:
+ // MOVMSK64(BITCAST(SHUF32 X, (1,0,3,2))): even though all the elements are
+ // demanded, the result can change because the elements are swapped around.
+ //
+ // To address this, we need to make sure all the "high" elements are moved
+ // to other "high" locations.
+ MVT BaseVT = BaseVec.getSimpleValueType();
+ unsigned BaseNumElts = BaseVT.getVectorNumElements();
+ if (BaseNumElts > NumElts) {
+ ImportantLocs = APInt::getZero(NumShuffleElts);
+ assert((BaseNumElts % NumElts) == 0 &&
+ "Vec with unsupported element size");
+ unsigned Scale = BaseNumElts / NumElts;
+ for (unsigned i = 0; i < BaseNumElts; ++i) {
+ if ((i % Scale) == (Scale - 1))
+ ImportantLocs.setBit(i);
+ }
+ } else {
+ ImportantLocs = APInt::getAllOnes(NumShuffleElts);
+ }
+
+ for (unsigned ShufSrc = 0; ShufSrc < ShuffleMask.size(); ++ShufSrc) {
+ int ShufDst = ShuffleMask[ShufSrc];
+ assert(0 <= ShufDst && ShufDst < (int)NumShuffleElts &&
+ "Bad unary shuffle index");
+ if (ImportantLocs[ShufSrc] && ImportantLocs[ShufDst])
+ Result.setBit(ShufSrc);
}
- if (DemandedElts.isAllOnes()) {
+
+ if (Result == ImportantLocs) {
SDLoc DL(EFLAGS);
SDValue Result = DAG.getBitcast(VecVT, ShuffleInputs[0]);
Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
diff --git a/llvm/test/CodeGen/X86/combine-ptest.ll b/llvm/test/CodeGen/X86/combine-ptest.ll
index 337edef96beee2c..e5854564251a3e8 100644
--- a/llvm/test/CodeGen/X86/combine-ptest.ll
+++ b/llvm/test/CodeGen/X86/combine-ptest.ll
@@ -265,7 +265,6 @@ define i32 @ptestz_v2i64_signbits(<2 x i64> %c, i32 %a, i32 %b) {
; SSE41-LABEL: ptestz_v2i64_signbits:
; SSE41: # %bb.0:
; SSE41-NEXT: movl %edi, %eax
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE41-NEXT: movmskps %xmm0, %ecx
; SSE41-NEXT: testl %ecx, %ecx
; SSE41-NEXT: cmovnel %esi, %eax
diff --git a/llvm/test/CodeGen/X86/movmsk-cmp.ll b/llvm/test/CodeGen/X86/movmsk-cmp.ll
index a0901e265f5ae97..f26bbb7e5c2bdac 100644
--- a/llvm/test/CodeGen/X86/movmsk-cmp.ll
+++ b/llvm/test/CodeGen/X86/movmsk-cmp.ll
@@ -4430,3 +4430,141 @@ define i32 @PR39665_c_ray_opt(<2 x double> %x, <2 x double> %y) {
%r = select i1 %u, i32 42, i32 99
ret i32 %r
}
+
+define i32 @pr67287(<2 x i64> %broadcast.splatinsert25) {
+; SSE2-LABEL: pr67287:
+; SSE2: # %bb.0: # %entry
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,3,2]
+; SSE2-NEXT: movmskpd %xmm0, %eax
+; SSE2-NEXT: testl %eax, %eax
+; SSE2-NEXT: jne .LBB97_2
+; SSE2-NEXT: # %bb.1: # %entry
+; SSE2-NEXT: movd %xmm1, %eax
+; SSE2-NEXT: testb $1, %al
+; SSE2-NEXT: jne .LBB97_2
+; SSE2-NEXT: # %bb.3: # %middle.block
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: retq
+; SSE2-NEXT: .LBB97_2:
+; SSE2-NEXT: movw $0, 0
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: pr67287:
+; SSE41: # %bb.0: # %entry
+; SSE41-NEXT: pxor %xmm1, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; SSE41-NEXT: pcmpeqq %xmm1, %xmm0
+; SSE41-NEXT: movmskpd %xmm0, %eax
+; SSE41-NEXT: testl %eax, %eax
+; SSE41-NEXT: jne .LBB97_2
+; SSE41-NEXT: # %bb.1: # %entry
+; SSE41-NEXT: movd %xmm0, %eax
+; SSE41-NEXT: testb $1, %al
+; SSE41-NEXT: jne .LBB97_2
+; SSE41-NEXT: # %bb.3: # %middle.block
+; SSE41-NEXT: xorl %eax, %eax
+; SSE41-NEXT: retq
+; SSE41-NEXT: .LBB97_2:
+; SSE41-NEXT: movw $0, 0
+; SSE41-NEXT: xorl %eax, %eax
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: pr67287:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vtestpd %xmm0, %xmm0
+; AVX1-NEXT: jne .LBB97_2
+; AVX1-NEXT: # %bb.1: # %entry
+; AVX1-NEXT: vmovd %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: jne .LBB97_2
+; AVX1-NEXT: # %bb.3: # %middle.block
+; AVX1-NEXT: xorl %eax, %eax
+; AVX1-NEXT: retq
+; AVX1-NEXT: .LBB97_2:
+; AVX1-NEXT: movw $0, 0
+; AVX1-NEXT: xorl %eax, %eax
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: pr67287:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vtestpd %xmm0, %xmm0
+; AVX2-NEXT: jne .LBB97_2
+; AVX2-NEXT: # %bb.1: # %entry
+; AVX2-NEXT: vmovd %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: jne .LBB97_2
+; AVX2-NEXT: # %bb.3: # %middle.block
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: retq
+; AVX2-NEXT: .LBB97_2:
+; AVX2-NEXT: movw $0, 0
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: retq
+;
+; KNL-LABEL: pr67287:
+; KNL: # %bb.0: # %entry
+; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; KNL-NEXT: vptestnmq %zmm0, %zmm0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: testb $3, %al
+; KNL-NEXT: jne .LBB97_2
+; KNL-NEXT: # %bb.1: # %entry
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: testb $1, %al
+; KNL-NEXT: jne .LBB97_2
+; KNL-NEXT: # %bb.3: # %middle.block
+; KNL-NEXT: xorl %eax, %eax
+; KNL-NEXT: vzeroupper
+; KNL-NEXT: retq
+; KNL-NEXT: .LBB97_2:
+; KNL-NEXT: movw $0, 0
+; KNL-NEXT: xorl %eax, %eax
+; KNL-NEXT: vzeroupper
+; KNL-NEXT: retq
+;
+; SKX-LABEL: pr67287:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; SKX-NEXT: vptestnmq %xmm0, %xmm0, %k0
+; SKX-NEXT: kortestb %k0, %k0
+; SKX-NEXT: jne .LBB97_2
+; SKX-NEXT: # %bb.1: # %entry
+; SKX-NEXT: kmovd %k0, %eax
+; SKX-NEXT: testb $1, %al
+; SKX-NEXT: jne .LBB97_2
+; SKX-NEXT: # %bb.3: # %middle.block
+; SKX-NEXT: xorl %eax, %eax
+; SKX-NEXT: retq
+; SKX-NEXT: .LBB97_2:
+; SKX-NEXT: movw $0, 0
+; SKX-NEXT: xorl %eax, %eax
+; SKX-NEXT: retq
+entry:
+ %0 = and <2 x i64> %broadcast.splatinsert25, <i64 4294967295, i64 4294967295>
+ %1 = icmp eq <2 x i64> %0, zeroinitializer
+ %shift = shufflevector <2 x i1> %1, <2 x i1> zeroinitializer, <2 x i32> <i32 1, i32 poison>
+ %2 = or <2 x i1> %1, %shift
+ %3 = extractelement <2 x i1> %2, i64 0
+ %4 = extractelement <2 x i1> %1, i64 0
+ %5 = or i1 %3, %4
+ br i1 %5, label %6, label %middle.block
+
+6: ; preds = %entry
+ store i16 0, ptr null, align 2
+ br label %middle.block
+
+middle.block: ; preds = %6, %entry
+ ret i32 0
+}
``````````
</details>
https://github.com/llvm/llvm-project/pull/68369
More information about the llvm-commits
mailing list