[llvm-commits] [llvm] r137238 - in /llvm/trunk: lib/Target/X86/X86ISelLowering.cpp test/CodeGen/X86/opt-shuff-tstore.ll

Wed Aug 10 12:30:14 PDT 2011

Author: nadav
Date: Wed Aug 10 14:30:14 2011
New Revision: 137238

URL: http://llvm.org/viewvc/llvm-project?rev=137238&view=rev
Log:
When performing a truncating store, it is sometimes possible to rearrange the
data in-register prior to saving to memory.  By reordering the data in-register
first, we avoid the need to store multiple scalars to memory and can often emit
a single regular store instead.


Added:
    llvm/trunk/test/CodeGen/X86/opt-shuff-tstore.ll
Modified:
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=137238&r1=137237&r2=137238&view=diff
==============================================================================

--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Wed Aug 10 14:30:14 2011
@@ -12574,14 +12574,91 @@
 /// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
 static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
                                    const X86Subtarget *Subtarget) {
+  StoreSDNode *St = cast<StoreSDNode>(N);
+  EVT VT = St->getValue().getValueType();
+  EVT StVT = St->getMemoryVT();
+  DebugLoc dl = St->getDebugLoc();
+
+  // Optimize trunc store (of multiple scalars) to shuffle and store.
+  // First, pack all of the elements in one place. Next, store to memory
+  // in fewer chunks.
+  if (St->isTruncatingStore() && VT.isVector()) {
+    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+    unsigned NumElems = VT.getVectorNumElements();
+    assert(StVT != VT && "Cannot truncate to the same type");
+    unsigned FromSz = VT.getVectorElementType().getSizeInBits();
+    unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
+
+    // From, To sizes and ElemCount must be pow of two
+    if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
+    // We are going to use the original vector elt for storing.
+    // accumulated smaller vector elements must be a multiple of bigger size.
+    if (0 != (NumElems * ToSz) % FromSz) return SDValue();
+    unsigned SizeRatio  = FromSz / ToSz;
+
+    assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
+
+    // Create a type on which we perform the shuffle
+    EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
+            StVT.getScalarType(), NumElems*SizeRatio);
+
+    assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
+
+    SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, St->getValue());
+    SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
+    for (unsigned i = 0; i < NumElems; i++ ) ShuffleVec[i] = i * SizeRatio;
+
+    // Can't shuffle using an illegal type
+    if (!TLI.isTypeLegal(WideVecVT)) return SDValue();
+
+    SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
+                                DAG.getUNDEF(WideVec.getValueType()),
+                                ShuffleVec.data());
+    // At this point all of the data is stored at the bottom of the
+    // register. We now need to save it to mem.
+
+    // Find the largest store unit
+    MVT StoreType = MVT::i8;
+    for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
+         tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
+      MVT Tp = (MVT::SimpleValueType)tp;
+      if (TLI.isTypeLegal(Tp) && StoreType.getSizeInBits() < NumElems * ToSz)
+        StoreType = Tp;
+    }
+
+    // Bitcast the original vector into a vector of store-size units
+    EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
+            StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits());
+    assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
+    SDValue ShuffWide = DAG.getNode(ISD::BITCAST, dl, StoreVecVT, Shuff);
+    SmallVector<SDValue, 8> Chains;
+    SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8,
+                                        TLI.getPointerTy());
+    SDValue Ptr = St->getBasePtr();
+
+    // Perform one or more big stores into memory.
+    for (unsigned i = 0; i < (ToSz*NumElems)/StoreType.getSizeInBits() ; i++) {
+      SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+                                   StoreType, ShuffWide,
+                                   DAG.getIntPtrConstant(i));
+      SDValue Ch = DAG.getStore(St->getChain(), dl, SubVec, Ptr,
+                                St->getPointerInfo(), St->isVolatile(),
+                                St->isNonTemporal(), St->getAlignment());
+      Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
+      Chains.push_back(Ch);
+    }
+
+    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0],
+                               Chains.size());
+  }
+
+
   // Turn load->store of MMX types into GPR load/stores.  This avoids clobbering
   // the FP state in cases where an emms may be missing.
   // A preferable solution to the general problem is to figure out the right
   // places to insert EMMS.  This qualifies as a quick hack.
 
   // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
-  StoreSDNode *St = cast<StoreSDNode>(N);
-  EVT VT = St->getValue().getValueType();
   if (VT.getSizeInBits() != 64)
     return SDValue();
 

Added: llvm/trunk/test/CodeGen/X86/opt-shuff-tstore.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/opt-shuff-tstore.ll?rev=137238&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/opt-shuff-tstore.ll (added)
+++ llvm/trunk/test/CodeGen/X86/opt-shuff-tstore.ll Wed Aug 10 14:30:14 2011
@@ -0,0 +1,39 @@
+; RUN: llc -mcpu=corei7 < %s -o - -promote-elements -mattr=+sse2,+sse41 | FileCheck %s
+
+; CHECK: func_4_8
+; A single memory write
+; CHECK: movd
+; CHECK-NEXT: ret
+define void @func_4_8(<4 x i8> %param, <4 x i8>* %p) {
+  %r = add <4 x i8> %param, <i8 1, i8 2, i8 3, i8 4>
+  store <4 x i8> %r, <4 x i8>* %p
+  ret void
+}
+
+; CHECK: func_4_16
+; CHECK: movq
+; CHECK-NEXT: ret
+define void @func_4_16(<4 x i16> %param, <4 x i16>* %p) {
+  %r = add <4 x i16> %param, <i16 1, i16 2, i16 3, i16 4>
+  store <4 x i16> %r, <4 x i16>* %p
+  ret void
+}
+
+; CHECK: func_8_8
+; CHECK: movq
+; CHECK-NEXT: ret
+define void @func_8_8(<8 x i8> %param, <8 x i8>* %p) {
+  %r = add <8 x i8> %param, <i8 1, i8 2, i8 3, i8 4, i8 1, i8 2, i8 3, i8 4>
+  store <8 x i8> %r, <8 x i8>* %p
+  ret void
+}
+
+; CHECK: func_2_32
+; CHECK: movq
+; CHECK-NEXT: ret
+define void @func_2_32(<2 x i32> %param, <2 x i32>* %p) {
+  %r = add <2 x i32> %param, <i32 1, i32 2>
+  store <2 x i32> %r, <2 x i32>* %p
+  ret void
+}
+