[llvm] 918e653 - [X86] Immediately call LowerShift from lowerBuildVectorToBitOp.

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Thu Jul 9 10:52:00 PDT 2020


Author: Craig Topper
Date: 2020-07-09T10:51:29-07:00
New Revision: 918e6531863187d65895fd68bbc622369b3d79f3

URL: https://github.com/llvm/llvm-project/commit/918e6531863187d65895fd68bbc622369b3d79f3
DIFF: https://github.com/llvm/llvm-project/commit/918e6531863187d65895fd68bbc622369b3d79f3.diff

LOG: [X86] Immediately call LowerShift from lowerBuildVectorToBitOp.

If we don't lower the vector shift immediately, the splat constant
vector we just created may be turned into a constant pool load before
we get around to lowering the shift. That makes it much harder to
recognize the shift amount as a constant. Sometimes we fail to see
through the constant pool load at all and end up trying to lower the
node as if it were a variable shift. That path requires custom
handling and may create a vselect that is unsupported on pre-SSE4.1
targets. Since we are already past LegalizeVectorOps, we cannot
legalize that vselect, because the code to do so lives in
LegalizeVectorOps rather than LegalizeDAG.

Calling LowerShift immediately ensures that we still see the splat
constant.
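
As a rough illustration of the ordering problem (this is a sketch, not
code from the patch; hasVisibleConstantSplatAmount is a hypothetical
helper), a shift-by-immediate check of this shape only succeeds while
the amount operand is still a BUILD_VECTOR node:

    #include "llvm/ADT/APInt.h"
    #include "llvm/CodeGen/SelectionDAGNodes.h"

    using namespace llvm;

    // Sketch: an immediate vector shift can only be formed while the
    // splat constant amount is still visible as a BUILD_VECTOR. Once a
    // DAG fold has replaced that BUILD_VECTOR with a load from the
    // constant pool, a check like this fails and the shift falls back
    // to the variable-amount path.
    static bool hasVisibleConstantSplatAmount(SDValue Amt, APInt &SplatVal) {
      if (auto *BV = dyn_cast<BuildVectorSDNode>(Amt))
        return ISD::isConstantSplatVector(BV, SplatVal);
      // Amt may now be e.g. (load (ConstantPool ...)): the constant
      // still exists, but not in a form this check can see.
      return false;
    }

By lowering the shift right away, the amount operand is still the
BUILD_VECTOR that was just created, so the splat constant remains
visible and the cheap shift-by-constant lowering can be used; for the
shift-left-by-1 in the test below this shows up as the
paddb %xmm1, %xmm1 in the CHECK lines.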

Fixes PR46527.

Differential Revision: https://reviews.llvm.org/D83455

Added: 
    llvm/test/CodeGen/X86/pr46527.ll

Modified: 
    llvm/lib/Target/X86/X86ISelLowering.cpp

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index cfbf38c56bc9..cdd7ba1ec432 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -9689,6 +9689,9 @@ static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
   return SDValue();
 }
 
+static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
+                          SelectionDAG &DAG);
+
 /// If a BUILD_VECTOR's source elements all apply the same bit operation and
 /// one of their operands is constant, lower to a pair of BUILD_VECTOR and
 /// just apply the bit to the vectors.
@@ -9696,6 +9699,7 @@ static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
 /// from this, but enough scalar bit operations are created from the later
 /// legalization + scalarization stages to need basic support.
 static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
+                                       const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
   SDLoc DL(Op);
   MVT VT = Op->getSimpleValueType(0);
@@ -9759,7 +9763,14 @@ static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
 
   SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
   SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
-  return DAG.getNode(Opcode, DL, VT, LHS, RHS);
+  SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
+
+  if (!IsShift)
+    return Res;
+
+  // Immediately lower the shift to ensure the constant build vector doesn't
+  // get converted to a constant pool before the shift is lowered.
+  return LowerShift(Res, Subtarget, DAG);
 }
 
 /// Create a vector constant without a load. SSE/AVX provide the bare minimum
@@ -10115,7 +10126,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
     return HorizontalOp;
   if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
     return Broadcast;
-  if (SDValue BitOp = lowerBuildVectorToBitOp(BV, DAG))
+  if (SDValue BitOp = lowerBuildVectorToBitOp(BV, Subtarget, DAG))
     return BitOp;
 
   unsigned EVTBits = EltVT.getSizeInBits();

diff --git a/llvm/test/CodeGen/X86/pr46527.ll b/llvm/test/CodeGen/X86/pr46527.ll
new file mode 100644
index 000000000000..1d3f16f8c1ae
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr46527.ll
@@ -0,0 +1,36 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+;RUN: llc < %s -mtriple=i686-unknown -mattr=sse2 -relocation-model=pic | FileCheck %s
+
+define void @f(<16 x i8>* %out, <16 x i8> %in, i1 %flag) {
+; CHECK-LABEL: f:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    calll .L0$pb
+; CHECK-NEXT:    .cfi_adjust_cfa_offset 4
+; CHECK-NEXT:  .L0$pb:
+; CHECK-NEXT:    popl %eax
+; CHECK-NEXT:    .cfi_adjust_cfa_offset -4
+; CHECK-NEXT:  .Ltmp0:
+; CHECK-NEXT:    addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp0-.L0$pb), %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %dl
+; CHECK-NEXT:    notb %dl
+; CHECK-NEXT:    andb $1, %dl
+; CHECK-NEXT:    movzbl %dl, %edx
+; CHECK-NEXT:    movd %edx, %xmm1
+; CHECK-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; CHECK-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
+; CHECK-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; CHECK-NEXT:    paddb %xmm1, %xmm1
+; CHECK-NEXT:    pxor %xmm0, %xmm1
+; CHECK-NEXT:    pxor {{\.LCPI.*}}@GOTOFF(%eax), %xmm1
+; CHECK-NEXT:    movdqa %xmm1, (%ecx)
+; CHECK-NEXT:    retl
+entry:
+  %0 = select i1 %flag, i8 0, i8 2
+  %1 = insertelement <16 x i8> undef, i8 %0, i32 0
+  %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer
+  %3 = xor <16 x i8> %2, %in
+  %4 = xor <16 x i8> %3, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  store <16 x i8> %4, <16 x i8>* %out, align 16
+  ret void
+}


        

