[llvm] r228006 - Merge consecutive 16-byte loads into one 32-byte load (PR22329)

Sanjay Patel spatel at rotateright.com
Tue Feb 3 10:54:00 PST 2015


Author: spatel
Date: Tue Feb  3 12:54:00 2015
New Revision: 228006

URL: http://llvm.org/viewvc/llvm-project?rev=228006&view=rev
Log:
Merge consecutive 16-byte loads into one 32-byte load (PR22329)

This patch detects consecutive vector loads using the existing 
EltsFromConsecutiveLoads() logic. This fixes:
http://llvm.org/bugs/show_bug.cgi?id=22329
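
For illustration, here is a minimal IR sketch (hypothetical function and
value names, not taken from the bug report) of the kind of pattern the
combine targets: two adjacent unaligned 16-byte loads feeding a 32-byte
shufflevector should now become a single unaligned 32-byte load.

  define <8 x float> @merge_consecutive(<4 x float>* %p) {
    %p1 = getelementptr inbounds <4 x float>* %p, i64 1
    ; Two consecutive 16-byte loads: bytes 0-15 and bytes 16-31 of %p.
    %lo = load <4 x float>* %p, align 1
    %hi = load <4 x float>* %p1, align 1
    ; Concatenate the halves; this should now select a single vmovups.
    %v = shufflevector <4 x float> %lo, <4 x float> %hi,
                       <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
    ret <8 x float> %v
  }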

This patch effectively reverts the tablegen additions of D6492 / 
http://reviews.llvm.org/rL224344, which in hindsight were a horrible hack.

The test cases that were added with that patch are simply modified to load
from varying offsets of a base pointer, as shown in the test diff below.
Loads like these did not match the existing tablegen patterns, but the new
combine still merges them.

A happy side effect of doing this optimization earlier is that we can now fold
the load into a math op where possible; this is shown in some of the updated
checks in the test file.
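
As a sketch of that side effect (hypothetical example; the real checks live
in unaligned-32-byte-memops.ll below), the merged 32-byte load can now
disappear into the add's memory operand on subtargets where unaligned
32-byte accesses are fast:

  ; On Haswell this should now become something like
  ;   vpaddq (%rdi), %ymm0, %ymm0
  ; instead of a separate vmovdqu followed by a register-register vpaddq.
  define <4 x i64> @fold_into_add(<2 x i64>* %p, <4 x i64> %x) {
    %p1 = getelementptr inbounds <2 x i64>* %p, i64 1
    %v1 = load <2 x i64>* %p, align 1
    %v2 = load <2 x i64>* %p1, align 1
    %v3 = shufflevector <2 x i64> %v1, <2 x i64> %v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    %v4 = add <4 x i64> %v3, %x
    ret <4 x i64> %v4
  }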

Differential Revision: http://reviews.llvm.org/D7303


Modified:
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/lib/Target/X86/X86InstrSSE.td
    llvm/trunk/test/CodeGen/X86/unaligned-32-byte-memops.ll

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=228006&r1=228005&r2=228006&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Tue Feb  3 12:54:00 2015
@@ -6011,9 +6011,9 @@ LowerAsSplatVectorLoad(SDValue SrcOp, MV
   return SDValue();
 }
 
-/// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a
-/// vector of type 'VT', see if the elements can be replaced by a single large
-/// load which has the same value as a build_vector whose operands are 'elts'.
+/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
+/// elements can be replaced by a single large load which has the same value as
+/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
 ///
 /// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
 ///
@@ -6023,7 +6023,6 @@ LowerAsSplatVectorLoad(SDValue SrcOp, MV
 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
                                         SDLoc &DL, SelectionDAG &DAG,
                                         bool isAfterLegalize) {
-  EVT EltVT = VT.getVectorElementType();
   unsigned NumElems = Elts.size();
 
   LoadSDNode *LDBase = nullptr;
@@ -6034,7 +6033,9 @@ static SDValue EltsFromConsecutiveLoads(
   // non-consecutive, bail out.
   for (unsigned i = 0; i < NumElems; ++i) {
     SDValue Elt = Elts[i];
-
+    // Look through a bitcast.
+    if (Elt.getNode() && Elt.getOpcode() == ISD::BITCAST)
+      Elt = Elt.getOperand(0);
     if (!Elt.getNode() ||
         (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
       return SDValue();
@@ -6049,7 +6050,12 @@ static SDValue EltsFromConsecutiveLoads(
       continue;
 
     LoadSDNode *LD = cast<LoadSDNode>(Elt);
-    if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i))
+    EVT LdVT = Elt.getValueType();
+    // Each loaded element must be the correct fractional portion of the
+    // requested vector load.
+    if (LdVT.getSizeInBits() != VT.getSizeInBits() / NumElems)
+      return SDValue();
+    if (!DAG.isConsecutiveLoad(LD, LDBase, LdVT.getSizeInBits() / 8, i))
       return SDValue();
     LastLoadedElt = i;
   }
@@ -6058,6 +6064,12 @@ static SDValue EltsFromConsecutiveLoads(
   // load of the entire vector width starting at the base pointer.  If we found
   // consecutive loads for the low half, generate a vzext_load node.
   if (LastLoadedElt == NumElems - 1) {
+    assert(LDBase && "Did not find base load for merging consecutive loads");
+    EVT EltVT = LDBase->getValueType(0);
+    // Ensure that the input vector size for the merged loads matches the
+    // cumulative size of the input elements.
+    if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
+      return SDValue();
 
     if (isAfterLegalize &&
         !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, VT))
@@ -6084,6 +6096,7 @@ static SDValue EltsFromConsecutiveLoads(
 
  //TODO: The code below fires only for loading the low v2i32 / v2f32
   //of a v4i32 / v4f32. It's probably worth generalizing.
+  EVT EltVT = VT.getVectorElementType();
   if (NumElems == 4 && LastLoadedElt == 1 && (EltVT.getSizeInBits() == 32) &&
       DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
     SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
@@ -13164,25 +13177,44 @@ static SDValue LowerINSERT_SUBVECTOR(SDV
                                      SelectionDAG &DAG) {
   if (!Subtarget->hasAVX())
     return SDValue();
-  
+
   SDLoc dl(Op);
   SDValue Vec = Op.getOperand(0);
   SDValue SubVec = Op.getOperand(1);
   SDValue Idx = Op.getOperand(2);
+
+  if (!isa<ConstantSDNode>(Idx))
+    return SDValue();
+
+  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
   MVT OpVT = Op.getSimpleValueType();
   MVT SubVecVT = SubVec.getSimpleValueType();
-    
+
+  // Fold two 16-byte subvector loads into one 32-byte load:
+  // (insert_subvector (insert_subvector undef, (load addr), 0),
+  //                   (load addr + 16), Elts/2)
+  // --> load32 addr
+  if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
+      Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
+      OpVT.is256BitVector() && SubVecVT.is128BitVector() &&
+      !Subtarget->isUnalignedMem32Slow()) {
+    SDValue SubVec2 = Vec.getOperand(1);
+    if (auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2))) {
+      if (Idx2->getZExtValue() == 0) {
+        SDValue Ops[] = { SubVec2, SubVec };
+        SDValue LD = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false);
+        if (LD.getNode())
+          return LD;
+      }
+    }
+  }
+
   if ((OpVT.is256BitVector() || OpVT.is512BitVector()) &&
-      SubVecVT.is128BitVector() && isa<ConstantSDNode>(Idx)) {
-    unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+      SubVecVT.is128BitVector())
     return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
-  }
 
-  if (OpVT.is512BitVector() &&
-      SubVecVT.is256BitVector() && isa<ConstantSDNode>(Idx)) {
-    unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+  if (OpVT.is512BitVector() && SubVecVT.is256BitVector())
     return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
-  }
 
   return SDValue();
 }
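
To make the new insert_subvector combine concrete, here is a sketch in the
style of the tests below (hypothetical function, modeled on the existing
test cases): with AVX and fast unaligned 32-byte memory, the two 16-byte
loads feeding the insert_subvector chain should collapse into one 32-byte
load.

  declare <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float>, <4 x float>, i8)

  define <8 x float> @insert_subvector_merge(<4 x float>* %p) {
    %p1 = getelementptr inbounds <4 x float>* %p, i64 1
    %lo = load <4 x float>* %p, align 1
    %hi = load <4 x float>* %p1, align 1
    ; Widen the low half into a 256-bit value, then insert the high half
    ; into the upper 128 bits. At the DAG level this is the
    ; (insert_subvector (insert_subvector undef, (load addr), 0),
    ;                   (load addr + 16), Elts/2) shape matched above.
    %lo256 = shufflevector <4 x float> %lo, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
    %v = tail call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %lo256, <4 x float> %hi, i8 1)
    ret <8 x float> %v
  }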

Modified: llvm/trunk/lib/Target/X86/X86InstrSSE.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrSSE.td?rev=228006&r1=228005&r2=228006&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrSSE.td (original)
+++ llvm/trunk/lib/Target/X86/X86InstrSSE.td Tue Feb  3 12:54:00 2015
@@ -8141,49 +8141,6 @@ def : Pat<(vinsert128_insert:$ins (v4f64
                          (INSERT_get_vinsert128_imm VR256:$ins))>;
 }
 
-// Combine two consecutive 16-byte loads with a common destination register into
-// one 32-byte load to that register.
-let Predicates = [HasAVX, HasFastMem32] in {
-  def : Pat<(insert_subvector
-              (v8f32 (insert_subvector undef, (loadv4f32 addr:$src), (iPTR 0))),
-              (loadv4f32 (add addr:$src, (iPTR 16))),
-              (iPTR 4)),
-            (VMOVUPSYrm addr:$src)>;
-
-  def : Pat<(insert_subvector
-              (v4f64 (insert_subvector undef, (loadv2f64 addr:$src), (iPTR 0))),
-              (loadv2f64 (add addr:$src, (iPTR 16))),
-              (iPTR 2)),
-            (VMOVUPDYrm addr:$src)>;
-
-  def : Pat<(insert_subvector
-              (v32i8 (insert_subvector
-                undef, (bc_v16i8 (loadv2i64 addr:$src)), (iPTR 0))),
-              (bc_v16i8 (loadv2i64 (add addr:$src, (iPTR 16)))),
-              (iPTR 16)),
-            (VMOVDQUYrm addr:$src)>;
-
-  def : Pat<(insert_subvector
-              (v16i16 (insert_subvector
-                undef, (bc_v8i16 (loadv2i64 addr:$src)), (iPTR 0))),
-              (bc_v8i16 (loadv2i64 (add addr:$src, (iPTR 16)))),
-              (iPTR 8)),
-            (VMOVDQUYrm addr:$src)>;
-
-  def : Pat<(insert_subvector
-              (v8i32 (insert_subvector
-                undef, (bc_v4i32 (loadv2i64 addr:$src)), (iPTR 0))),
-              (bc_v4i32 (loadv2i64 (add addr:$src, (iPTR 16)))),
-              (iPTR 4)),
-            (VMOVDQUYrm addr:$src)>;
-
-  def : Pat<(insert_subvector
-              (v4i64 (insert_subvector undef, (loadv2i64 addr:$src), (iPTR 0))),
-              (loadv2i64 (add addr:$src, (iPTR 16))),
-              (iPTR 2)),
-            (VMOVDQUYrm addr:$src)>;
-}
-
 let Predicates = [HasAVX1Only] in {
 def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),
                                    (iPTR imm)),

Modified: llvm/trunk/test/CodeGen/X86/unaligned-32-byte-memops.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/unaligned-32-byte-memops.ll?rev=228006&r1=228005&r2=228006&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/unaligned-32-byte-memops.ll (original)
+++ llvm/trunk/test/CodeGen/X86/unaligned-32-byte-memops.ll Tue Feb  3 12:54:00 2015
@@ -65,8 +65,9 @@ define <8 x float> @combine_16_byte_load
   ; HASWELL: vmovups
   ; HASWELL-NEXT: retq
 
-  %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 1
-  %v1 = load <4 x float>* %ptr, align 1
+  %ptr1 = getelementptr inbounds <4 x float>* %ptr, i64 1
+  %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 2
+  %v1 = load <4 x float>* %ptr1, align 1
   %v2 = load <4 x float>* %ptr2, align 1
   %shuffle = shufflevector <4 x float> %v1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
   %v3 = tail call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %shuffle, <4 x float> %v2, i8 1)
@@ -88,8 +89,9 @@ define <8 x float> @combine_16_byte_load
   ; HASWELL: vmovups
   ; HASWELL-NEXT: retq
 
-  %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 1
-  %v1 = load <4 x float>* %ptr, align 1
+  %ptr1 = getelementptr inbounds <4 x float>* %ptr, i64 2
+  %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 3
+  %v1 = load <4 x float>* %ptr1, align 1
   %v2 = load <4 x float>* %ptr2, align 1
   %shuffle = shufflevector <4 x float> %v2, <4 x float> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2, i32 3>
   %v3 = tail call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %shuffle, <4 x float> %v1, i8 0)
@@ -111,8 +113,9 @@ define <8 x float> @combine_16_byte_load
   ; HASWELL: vmovups
   ; HASWELL-NEXT: retq
 
-  %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 1
-  %v1 = load <4 x float>* %ptr, align 1
+  %ptr1 = getelementptr inbounds <4 x float>* %ptr, i64 3
+  %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 4
+  %v1 = load <4 x float>* %ptr1, align 1
   %v2 = load <4 x float>* %ptr2, align 1
   %v3 = shufflevector <4 x float> %v1, <4 x float> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   ret <8 x float> %v3
@@ -133,8 +136,9 @@ define <8 x float> @combine_16_byte_load
   ; HASWELL: vmovups
   ; HASWELL-NEXT: retq
 
-  %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 1
-  %v1 = load <4 x float>* %ptr, align 1
+  %ptr1 = getelementptr inbounds <4 x float>* %ptr, i64 4
+  %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 5
+  %v1 = load <4 x float>* %ptr1, align 1
   %v2 = load <4 x float>* %ptr2, align 1
   %v3 = shufflevector <4 x float> %v2, <4 x float> %v1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
   ret <8 x float> %v3
@@ -160,12 +164,13 @@ define <4 x i64> @combine_16_byte_loads_
   ; BTVER2-NEXT: vinsertf128
   ; BTVER2-NEXT: retq
 
-  ; HASWELL: vmovdqu
-  ; HASWELL-NEXT: vpaddq
+  ; HASWELL-NOT: vextract
+  ; HASWELL: vpaddq
   ; HASWELL-NEXT: retq
 
-  %ptr2 = getelementptr inbounds <2 x i64>* %ptr, i64 1
-  %v1 = load <2 x i64>* %ptr, align 1
+  %ptr1 = getelementptr inbounds <2 x i64>* %ptr, i64 5
+  %ptr2 = getelementptr inbounds <2 x i64>* %ptr, i64 6
+  %v1 = load <2 x i64>* %ptr1, align 1
   %v2 = load <2 x i64>* %ptr2, align 1
   %v3 = shufflevector <2 x i64> %v1, <2 x i64> %v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %v4 = add <4 x i64> %v3, %x
@@ -187,12 +192,13 @@ define <8 x i32> @combine_16_byte_loads_
   ; BTVER2-NEXT: vinsertf128
   ; BTVER2-NEXT: retq
 
-  ; HASWELL: vmovdqu
-  ; HASWELL-NEXT: vpaddd
+  ; HASWELL-NOT: vextract
+  ; HASWELL: vpaddd
   ; HASWELL-NEXT: retq
 
-  %ptr2 = getelementptr inbounds <4 x i32>* %ptr, i64 1
-  %v1 = load <4 x i32>* %ptr, align 1
+  %ptr1 = getelementptr inbounds <4 x i32>* %ptr, i64 6
+  %ptr2 = getelementptr inbounds <4 x i32>* %ptr, i64 7
+  %v1 = load <4 x i32>* %ptr1, align 1
   %v2 = load <4 x i32>* %ptr2, align 1
   %v3 = shufflevector <4 x i32> %v1, <4 x i32> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %v4 = add <8 x i32> %v3, %x
@@ -214,12 +220,13 @@ define <16 x i16> @combine_16_byte_loads
   ; BTVER2-NEXT: vinsertf128
   ; BTVER2-NEXT: retq
 
-  ; HASWELL: vmovdqu
-  ; HASWELL-NEXT: vpaddw
+  ; HASWELL-NOT: vextract
+  ; HASWELL: vpaddw
   ; HASWELL-NEXT: retq
 
-  %ptr2 = getelementptr inbounds <8 x i16>* %ptr, i64 1
-  %v1 = load <8 x i16>* %ptr, align 1
+  %ptr1 = getelementptr inbounds <8 x i16>* %ptr, i64 7
+  %ptr2 = getelementptr inbounds <8 x i16>* %ptr, i64 8
+  %v1 = load <8 x i16>* %ptr1, align 1
   %v2 = load <8 x i16>* %ptr2, align 1
   %v3 = shufflevector <8 x i16> %v1, <8 x i16> %v2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %v4 = add <16 x i16> %v3, %x
@@ -241,12 +248,13 @@ define <32 x i8> @combine_16_byte_loads_
   ; BTVER2-NEXT: vinsertf128
   ; BTVER2-NEXT: retq
 
-  ; HASWELL: vmovdqu
-  ; HASWELL-NEXT: vpaddb
+  ; HASWELL-NOT: vextract
+  ; HASWELL: vpaddb
   ; HASWELL-NEXT: retq
 
-  %ptr2 = getelementptr inbounds <16 x i8>* %ptr, i64 1
-  %v1 = load <16 x i8>* %ptr, align 1
+  %ptr1 = getelementptr inbounds <16 x i8>* %ptr, i64 8
+  %ptr2 = getelementptr inbounds <16 x i8>* %ptr, i64 9
+  %v1 = load <16 x i8>* %ptr1, align 1
   %v2 = load <16 x i8>* %ptr2, align 1
   %v3 = shufflevector <16 x i8> %v1, <16 x i8> %v2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
   %v4 = add <32 x i8> %v3, %x
@@ -261,16 +269,17 @@ define <4 x double> @combine_16_byte_loa
   ; SANDYB-NEXT: vaddpd
   ; SANDYB-NEXT: retq
 
-  ; BTVER2: vmovupd
-  ; BTVER2-NEXT: vaddpd
+  ; BTVER2-NOT: vinsertf128
+  ; BTVER2: vaddpd
   ; BTVER2-NEXT: retq
 
-  ; HASWELL: vmovupd
+  ; HASWELL-NOT: vinsertf128
   ; HASWELL: vaddpd
   ; HASWELL-NEXT: retq
 
-  %ptr2 = getelementptr inbounds <2 x double>* %ptr, i64 1
-  %v1 = load <2 x double>* %ptr, align 1
+  %ptr1 = getelementptr inbounds <2 x double>* %ptr, i64 9
+  %ptr2 = getelementptr inbounds <2 x double>* %ptr, i64 10
+  %v1 = load <2 x double>* %ptr1, align 1
   %v2 = load <2 x double>* %ptr2, align 1
   %v3 = shufflevector <2 x double> %v1, <2 x double> %v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %v4 = fadd <4 x double> %v3, %x




