[llvm-commits] [llvm] r139995 - in /llvm/trunk: lib/Target/X86/X86ISelLowering.cpp test/CodeGen/X86/trunc-ext-ld-st.ll
Nadav Rotem
nadav.rotem at intel.com
Sun Sep 18 03:39:32 PDT 2011
Author: nadav
Date: Sun Sep 18 05:39:32 2011
New Revision: 139995
URL: http://llvm.org/viewvc/llvm-project?rev=139995&view=rev
Log:
When promoting integer vectors we often create ext-loads. This patch adds a
dag-combine optimization to implement the ext-load efficiently (using shuffles).
For example the type <4 x i8> is stored in memory as i32, but it needs to
find its way into a <4 x i32> register. Previously we scalarized the memory
access, now we use shuffles.
Added:
llvm/trunk/test/CodeGen/X86/trunc-ext-ld-st.ll
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=139995&r1=139994&r2=139995&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Sun Sep 18 05:39:32 2011
@@ -1138,6 +1138,7 @@
setTargetDAGCombine(ISD::AND);
setTargetDAGCombine(ISD::ADD);
setTargetDAGCombine(ISD::SUB);
+ setTargetDAGCombine(ISD::LOAD);
setTargetDAGCombine(ISD::STORE);
setTargetDAGCombine(ISD::ZERO_EXTEND);
setTargetDAGCombine(ISD::SINT_TO_FP);
@@ -13433,6 +13434,89 @@
return SDValue();
}
+/// PerformLOADCombine - Do target-specific dag combines on LOAD nodes.
+static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ LoadSDNode *Ld = cast<LoadSDNode>(N);
+ EVT RegVT = Ld->getValueType(0);
+ EVT MemVT = Ld->getMemoryVT();
+ DebugLoc dl = Ld->getDebugLoc();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ ISD::LoadExtType Ext = Ld->getExtensionType();
+
+ // If this is a vector EXT Load then attempt to optimize it using a
+ // shuffle. We need SSE4 for the shuffles.
+ // TODO: It is possible to support ZExt by zeroing the undef values
+ // during the shuffle phase or after the shuffle.
+ if (RegVT.isVector() && Ext == ISD::EXTLOAD && Subtarget->hasSSE41()) {
+ assert(MemVT != RegVT && "Cannot extend to the same type");
+ assert(MemVT.isVector() && "Must load a vector from memory");
+
+ unsigned NumElems = RegVT.getVectorNumElements();
+ unsigned RegSz = RegVT.getSizeInBits();
+ unsigned MemSz = MemVT.getSizeInBits();
+ assert(RegSz > MemSz && "Register size must be greater than the mem size");
+ // All sizes must be a power of two
+ if (!isPowerOf2_32(RegSz * MemSz * NumElems)) return SDValue();
+
+ // Attempt to load the original value using a single load op.
+ // Find a scalar type which is equal to the loaded word size.
+ MVT SclrLoadTy = MVT::i8;
+ for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
+ tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
+ MVT Tp = (MVT::SimpleValueType)tp;
+ if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() == MemSz) {
+ SclrLoadTy = Tp;
+ break;
+ }
+ }
+
+ // Proceed if a load word is found.
+ if (SclrLoadTy.getSizeInBits() != MemSz) return SDValue();
+
+ EVT LoadUnitVecVT = EVT::getVectorVT(*DAG.getContext(), SclrLoadTy,
+ RegSz/SclrLoadTy.getSizeInBits());
+
+ EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
+ RegSz/MemVT.getScalarType().getSizeInBits());
+ // Can't shuffle using an illegal type.
+ if (!TLI.isTypeLegal(WideVecVT)) return SDValue();
+
+ // Perform a single load.
+ SDValue ScalarLoad = DAG.getLoad(SclrLoadTy, dl, Ld->getChain(),
+ Ld->getBasePtr(),
+ Ld->getPointerInfo(), Ld->isVolatile(),
+ Ld->isNonTemporal(), Ld->getAlignment());
+
+ // Insert the word loaded into a vector.
+ SDValue ScalarInVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
+ LoadUnitVecVT, ScalarLoad);
+
+ // Bitcast the loaded value to a vector of the original element type, in
+ // the size of the target vector type.
+ SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, ScalarInVector);
+ unsigned SizeRatio = RegSz/MemSz;
+
+ // Redistribute the loaded elements into the different locations.
+ SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
+ for (unsigned i = 0; i < NumElems; i++) ShuffleVec[i*SizeRatio] = i;
+
+ SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
+ DAG.getUNDEF(SlicedVec.getValueType()),
+ ShuffleVec.data());
+
+ // Bitcast to the requested type.
+ Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
+ // Replace the original load with the new sequence
+ // and return the new chain.
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Shuff);
+ return SDValue(ScalarLoad.getNode(), 1);
+ }
+
+ return SDValue();
+}
+
/// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
const X86Subtarget *Subtarget) {
@@ -13479,9 +13563,7 @@
// From, To sizes and ElemCount must be pow of two
if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
- // We are going to use the original vector elt for storing.
- // accumulated smaller vector elements must be a multiple of bigger size.
- if (0 != (NumElems * ToSz) % FromSz) return SDValue();
+
unsigned SizeRatio = FromSz / ToSz;
assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
@@ -13885,6 +13967,7 @@
case ISD::SRL: return PerformShiftCombine(N, DAG, Subtarget);
case ISD::AND: return PerformAndCombine(N, DAG, DCI, Subtarget);
case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget);
+ case ISD::LOAD: return PerformLOADCombine(N, DAG, Subtarget);
case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget);
case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, this);
case X86ISD::FXOR:
Added: llvm/trunk/test/CodeGen/X86/trunc-ext-ld-st.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/trunc-ext-ld-st.ll?rev=139995&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/trunc-ext-ld-st.ll (added)
+++ llvm/trunk/test/CodeGen/X86/trunc-ext-ld-st.ll Sun Sep 18 05:39:32 2011
@@ -0,0 +1,82 @@
+; RUN: llc < %s -march=x86-64 -mcpu=corei7 -promote-elements -mattr=+sse41 | FileCheck %s
+
+;CHECK: load_2_i8
+; A single 16-bit load
+;CHECK: movzwl
+;CHECK: pshufb
+;CHECK: paddq
+;CHECK: pshufb
+; A single 16-bit store
+;CHECK: movw
+;CHECK: ret
+
+define void @load_2_i8(<2 x i8>* %A) {
+ %T = load <2 x i8>* %A
+ %G = add <2 x i8> %T, <i8 9, i8 7>
+ store <2 x i8> %G, <2 x i8>* %A
+ ret void
+}
+
+;CHECK: load_2_i16
+; Read 32-bits
+;CHECK: movd
+;CHECK: pshufb
+;CHECK: paddq
+;CHECK: pshufb
+;CHECK: movd
+;CHECK: ret
+define void @load_2_i16(<2 x i16>* %A) {
+ %T = load <2 x i16>* %A
+ %G = add <2 x i16> %T, <i16 9, i16 7>
+ store <2 x i16> %G, <2 x i16>* %A
+ ret void
+}
+
+;CHECK: load_2_i32
+;CHECK: pshufd
+;CHECK: paddq
+;CHECK: pshufd
+;CHECK: ret
+define void @load_2_i32(<2 x i32>* %A) {
+ %T = load <2 x i32>* %A
+ %G = add <2 x i32> %T, <i32 9, i32 7>
+ store <2 x i32> %G, <2 x i32>* %A
+ ret void
+}
+
+;CHECK: load_4_i8
+;CHECK: movd
+;CHECK: pshufb
+;CHECK: paddd
+;CHECK: pshufb
+;CHECK: ret
+define void @load_4_i8(<4 x i8>* %A) {
+ %T = load <4 x i8>* %A
+ %G = add <4 x i8> %T, <i8 1, i8 4, i8 9, i8 7>
+ store <4 x i8> %G, <4 x i8>* %A
+ ret void
+}
+
+;CHECK: load_4_i16
+;CHECK: punpcklwd
+;CHECK: paddd
+;CHECK: pshufb
+;CHECK: ret
+define void @load_4_i16(<4 x i16>* %A) {
+ %T = load <4 x i16>* %A
+ %G = add <4 x i16> %T, <i16 1, i16 4, i16 9, i16 7>
+ store <4 x i16> %G, <4 x i16>* %A
+ ret void
+}
+
+;CHECK: load_8_i8
+;CHECK: punpcklbw
+;CHECK: paddw
+;CHECK: pshufb
+;CHECK: ret
+define void @load_8_i8(<8 x i8>* %A) {
+ %T = load <8 x i8>* %A
+ %G = add <8 x i8> %T, %T
+ store <8 x i8> %G, <8 x i8>* %A
+ ret void
+}
More information about the llvm-commits
mailing list