[llvm] 5f2fe48 - [X86][TLI] SimplifyDemandedVectorEltsForTargetNode(): don't break apart broadcasts from which not just the 0'th elt is demanded

Sun Sep 19 07:39:29 PDT 2021

Author: Roman Lebedev
Date: 2021-09-19T17:38:32+03:00
New Revision: 5f2fe48d06c742872804da8b3d86596ed2bb9acb

URL: https://github.com/llvm/llvm-project/commit/5f2fe48d06c742872804da8b3d86596ed2bb9acb
DIFF: https://github.com/llvm/llvm-project/commit/5f2fe48d06c742872804da8b3d86596ed2bb9acb.diff

LOG: [X86][TLI] SimplifyDemandedVectorEltsForTargetNode(): don't break apart broadcasts from which not just the 0'th elt is demanded

Apparently this has no test coverage before D108382,
but D108382 itself shows a few regressions that this fixes.

It doesn't seem worthwhile breaking apart broadcasts,
assuming we want the broadcasted value to be preset in several elements,
not just the 0'th one.

Reviewed By: RKSimon

Differential Revision: https://reviews.llvm.org/D108411

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll
    llvm/test/CodeGen/X86/horizontal-sum.ll
    llvm/test/CodeGen/X86/sse41.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 8e4f238b2f07..81b011c27d0a 100644

--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -39978,6 +39978,12 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
     }
   }
 
+  // For broadcasts, unless we *only* demand the 0'th element,
+  // stop attempts at simplification here, we aren't going to improve things,
+  // this is better than any potential shuffle.
+  if (isTargetShuffleSplat(Op) && !DemandedElts.isOneValue())
+    return false;
+
   // Get target/faux shuffle mask.
   APInt OpUndef, OpZero;
   SmallVector<int, 64> OpMask;

diff  --git a/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll b/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll
index c7b1b92918b1..e62444202024 100644
--- a/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll
+++ b/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll
@@ -284,7 +284,7 @@ define <4 x i64> @vec256_eltty_i64_source_subvec_0_target_subvec_mask_3_unary(<4
 define <4 x i64> @vec256_eltty_i64_source_subvec_0_target_subvec_mask_3_binary(<4 x i64> %x, <4 x i64> %y) nounwind {
 ; CHECK-LABEL: vec256_eltty_i64_source_subvec_0_target_subvec_mask_3_binary:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,1]
+; CHECK-NEXT:    vbroadcastsd %xmm1, %ymm1
 ; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; CHECK-NEXT:    retq
   %r = shufflevector <4 x i64> %x, <4 x i64> %y, <4 x i32> <i32 0, i32 4, i32 2, i32 4>

diff  --git a/llvm/test/CodeGen/X86/horizontal-sum.ll b/llvm/test/CodeGen/X86/horizontal-sum.ll
index 6d053e9f7834..88691eedcab5 100644
--- a/llvm/test/CodeGen/X86/horizontal-sum.ll
+++ b/llvm/test/CodeGen/X86/horizontal-sum.ll
@@ -752,9 +752,9 @@ define <4 x i32> @sequential_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i3
 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3]
 ; AVX2-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
-; AVX2-SLOW-NEXT:    vpbroadcastd %xmm3, %xmm4
-; AVX2-SLOW-NEXT:    vpbroadcastq %xmm3, %xmm5
-; AVX2-SLOW-NEXT:    vpaddd %xmm4, %xmm5, %xmm4
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[1,1,1,1]
+; AVX2-SLOW-NEXT:    vpbroadcastd %xmm3, %xmm5
+; AVX2-SLOW-NEXT:    vpaddd %xmm5, %xmm4, %xmm4
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[2,2,2,2]
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3]

diff  --git a/llvm/test/CodeGen/X86/sse41.ll b/llvm/test/CodeGen/X86/sse41.ll
index 22f428b0604d..6803334e89a2 100644
--- a/llvm/test/CodeGen/X86/sse41.ll
+++ b/llvm/test/CodeGen/X86/sse41.ll
@@ -645,8 +645,7 @@ define <4 x i32> @pinsrd_from_shufflevector_i32(<4 x i32> %a, <4 x i32>* nocaptu
 ; X86-AVX1-LABEL: pinsrd_from_shufflevector_i32:
 ; X86-AVX1:       ## %bb.0: ## %entry
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-AVX1-NEXT:    vpermilps $0, (%eax), %xmm1 ## encoding: [0xc4,0xe3,0x79,0x04,0x08,0x00]
-; X86-AVX1-NEXT:    ## xmm1 = mem[0,0,0,0]
+; X86-AVX1-NEXT:    vbroadcastss (%eax), %xmm1 ## encoding: [0xc4,0xe2,0x79,0x18,0x08]
 ; X86-AVX1-NEXT:    vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08]
 ; X86-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[3]
 ; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
@@ -669,8 +668,7 @@ define <4 x i32> @pinsrd_from_shufflevector_i32(<4 x i32> %a, <4 x i32>* nocaptu
 ;
 ; X64-AVX1-LABEL: pinsrd_from_shufflevector_i32:
 ; X64-AVX1:       ## %bb.0: ## %entry
-; X64-AVX1-NEXT:    vpermilps $0, (%rdi), %xmm1 ## encoding: [0xc4,0xe3,0x79,0x04,0x0f,0x00]
-; X64-AVX1-NEXT:    ## xmm1 = mem[0,0,0,0]
+; X64-AVX1-NEXT:    vbroadcastss (%rdi), %xmm1 ## encoding: [0xc4,0xe2,0x79,0x18,0x0f]
 ; X64-AVX1-NEXT:    vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08]
 ; X64-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[3]
 ; X64-AVX1-NEXT:    retq ## encoding: [0xc3]