[llvm] 0e89ff8 - [X86] SimplifyDemandedBits - only narrow a broadcast source if we only have one use.
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Sun Sep 19 14:54:15 PDT 2021
Author: Simon Pilgrim
Date: 2021-09-19T22:53:30+01:00
New Revision: 0e89ff8195e994e5051f19669e1044d47120ac06
URL: https://github.com/llvm/llvm-project/commit/0e89ff8195e994e5051f19669e1044d47120ac06
DIFF: https://github.com/llvm/llvm-project/commit/0e89ff8195e994e5051f19669e1044d47120ac06.diff
LOG: [X86] SimplifyDemandedBits - only narrow a broadcast source if we only have one use.
Helps with the regression noted on D109065 - don't truncate a broadcast's scalar source if that source has other uses, as the wide value would still need to be materialized for those users, defeating the point of the narrowing.
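As a loose illustration (hypothetical IR, not taken from the commit - the actual fold fires on X86ISD::VBROADCAST nodes during SimplifyDemandedBits): here only the low 32 bits of each broadcast lane are demanded, which would normally allow narrowing the 64-bit source, but the scalar also has a second user, so truncating it would leave both the wide and the narrow value live:

  define <4 x i64> @bcast_multiuse(i64* %p, i64* %q) {
    %s = load i64, i64* %p
    store i64 %s, i64* %q    ; second use of the broadcast source
    %ins = insertelement <4 x i64> undef, i64 %s, i32 0
    %b = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer
    ; only the low 32 bits of each lane are demanded
    %lo = and <4 x i64> %b, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
    ret <4 x i64> %lo
  }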
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/oddshuffles.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 3187c41a4ab1..3520bbfe1983 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -40381,7 +40381,8 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
// Don't attempt this on AVX512 as it might affect broadcast folding.
// TODO: Should we attempt this for i32/i16 splats? They tend to be slower.
if ((BitWidth == 64) && SrcVT.isScalarInteger() && !Subtarget.hasAVX512() &&
- OriginalDemandedBits.countLeadingZeros() >= (BitWidth / 2)) {
+ OriginalDemandedBits.countLeadingZeros() >= (BitWidth / 2) &&
+ Src->hasOneUse()) {
MVT NewSrcVT = MVT::getIntegerVT(BitWidth / 2);
SDValue NewSrc =
TLO.DAG.getNode(ISD::TRUNCATE, SDLoc(Src), NewSrcVT, Src);
diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll
index 0a3ddb81d345..b7f7321a8d3d 100644
--- a/llvm/test/CodeGen/X86/oddshuffles.ll
+++ b/llvm/test/CodeGen/X86/oddshuffles.ll
@@ -2261,13 +2261,10 @@ define <16 x i32> @splat_v3i32(<3 x i32>* %ptr) {
;
; AVX1-LABEL: splat_v3i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: movq (%rdi), %rax
-; AVX1-NEXT: vmovq %rax, %xmm0
-; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4,5,6,7]
-; AVX1-NEXT: vmovd %eax, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6,7]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm1[1],ymm2[2,3,4,5,6,7]
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: splat_v3i32:
@@ -2289,13 +2286,10 @@ define <16 x i32> @splat_v3i32(<3 x i32>* %ptr) {
;
; XOP-LABEL: splat_v3i32:
; XOP: # %bb.0:
-; XOP-NEXT: movq (%rdi), %rax
-; XOP-NEXT: vmovq %rax, %xmm0
-; XOP-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4,5,6,7]
-; XOP-NEXT: vmovd %eax, %xmm2
-; XOP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; XOP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6,7]
+; XOP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
+; XOP-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm1[1],ymm2[2,3,4,5,6,7]
+; XOP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4,5,6,7]
; XOP-NEXT: retq
%1 = load <3 x i32>, <3 x i32>* %ptr, align 1
%2 = shufflevector <3 x i32> %1, <3 x i32> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>