[llvm] r290365 - Redo store splitting in CodeGenPrepare.

Wei Mi via llvm-commits llvm-commits at lists.llvm.org
Thu Dec 22 11:44:45 PST 2016


Author: wmi
Date: Thu Dec 22 13:44:45 2016
New Revision: 290365

URL: http://llvm.org/viewvc/llvm-project?rev=290365&view=rev
Log:
Redo store splitting in CodeGenPrepare.

This is a follow-up patch to https://reviews.llvm.org/D22840 that addresses the
case in which a value to be merged into an int64 pair is defined in a different
BB. Redoing the store splitting in CodeGenPrepare lets us match the pattern
across multiple BBs and move the relevant instructions into the same BB. We keep
the code in DAG combine as well, so we can still catch cases that only show up
after DAG combining runs.

Differential Revision: https://reviews.llvm.org/D25914
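
As a minimal sketch of the cross-BB case (value names are hypothetical; it
mirrors the mbb_int32_float_pair test added below): DAG combine works one
basic block at a time, so a bitcast sitting in a different BB hides the
pattern from it, while the CodeGenPrepare version can match and rewrite it:

  ; Before: the float half is bitcast in entry but merged and stored in next.
  define void @pair_sketch(i32 %i, float %f, i64* %p) {
  entry:
    %t0 = bitcast float %f to i32
    br label %next
  next:
    %t1 = zext i32 %t0 to i64
    %t2 = shl nuw i64 %t1, 32
    %t3 = zext i32 %i to i64
    %t4 = or i64 %t2, %t3
    store i64 %t4, i64* %p, align 8
    ret void
  }

  ; After CodeGenPrepare (roughly): the bitcast is recreated in the store's
  ; BB, beside two i32 stores, so later per-BB combining can fold it away.
  define void @pair_sketch_split(i32 %i, float %f, i64* %p) {
  entry:
    br label %next
  next:
    %b  = bitcast float %f to i32
    %p0 = bitcast i64* %p to i32*
    store i32 %i, i32* %p0, align 8
    %p1 = getelementptr i32, i32* %p0, i32 1
    store i32 %b, i32* %p1, align 4
    ret void
  }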


Modified:
    llvm/trunk/lib/CodeGen/CodeGenPrepare.cpp
    llvm/trunk/test/CodeGen/X86/split-store.ll

Modified: llvm/trunk/lib/CodeGen/CodeGenPrepare.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/CodeGenPrepare.cpp?rev=290365&r1=290364&r2=290365&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/CodeGenPrepare.cpp (original)
+++ llvm/trunk/lib/CodeGen/CodeGenPrepare.cpp Thu Dec 22 13:44:45 2016
@@ -131,6 +131,10 @@ static cl::opt<unsigned> FreqRatioToSkip
     cl::desc("Skip merging empty blocks if (frequency of empty block) / "
              "(frequency of destination block) is greater than this ratio"));
 
+static cl::opt<bool> ForceSplitStore(
+    "force-split-store", cl::Hidden, cl::init(false),
+    cl::desc("Force store splitting no matter what the target query says."));
+
 namespace {
 typedef SmallPtrSet<Instruction *, 16> SetOfInstrs;
 typedef PointerIntPair<Type *, 1, bool> TypeIsSExt;
@@ -5358,6 +5362,117 @@ bool CodeGenPrepare::optimizeExtractElem
   return false;
 }
 
+/// In the store instruction sequence below, the F and I values
+/// are bundled together as an i64 value before being stored into memory.
+/// Sometimes it is more efficient to generate separate stores for F and I,
+/// which can remove the bitwise instructions or sink them to colder places.
+///
+///   (store (or (zext (bitcast F to i32) to i64),
+///              (shl (zext I to i64), 32)), addr)  -->
+///   (store F, addr) and (store I, addr+4)
+///
+/// Similarly, splitting other kinds of merged stores can also be beneficial:
+/// For pair of {i32, i32}, i64 store --> two i32 stores.
+/// For pair of {i32, i16}, i64 store --> two i32 stores.
+/// For pair of {i16, i16}, i32 store --> two i16 stores.
+/// For pair of {i16, i8},  i32 store --> two i16 stores.
+/// For pair of {i8, i8},   i16 store --> two i8 stores.
+///
+/// We let each target determine which of these splits it supports.
+///
+/// These store patterns are commonly seen in code like the snippet below,
+/// when only std::make_pair(...) is SROA-transformed before being inlined
+/// into hoo.
+///   void goo(const std::pair<int, float> &);
+///   void hoo() {
+///     ...
+///     goo(std::make_pair(tmp, ftmp));
+///     ...
+///   }
+///
+/// Although we already have similar splitting in DAG Combine, we duplicate
+/// it in CodeGenPrepare to catch the case in which the pattern spans
+/// multiple BBs. The logic in DAG Combine is kept to catch cases generated
+/// during code expansion.
+static bool splitMergedValStore(StoreInst &SI, const DataLayout &DL,
+                                const TargetLowering &TLI) {
+  // Handle simple but common cases only.
+  Type *StoreType = SI.getValueOperand()->getType();
+  if (DL.getTypeStoreSizeInBits(StoreType) != DL.getTypeSizeInBits(StoreType) ||
+      DL.getTypeSizeInBits(StoreType) == 0)
+    return false;
+
+  unsigned HalfValBitSize = DL.getTypeSizeInBits(StoreType) / 2;
+  Type *SplitStoreType = Type::getIntNTy(SI.getContext(), HalfValBitSize);
+  if (DL.getTypeStoreSizeInBits(SplitStoreType) !=
+      DL.getTypeSizeInBits(SplitStoreType))
+    return false;
+
+  // Match the following patterns:
+  // (store (or (zext LValue to i64),
+  //            (shl (zext HValue to i64), HalfValBitSize)), addr)
+  //  or
+  // (store (or (shl (zext HValue to i64), HalfValBitSize),
+  //            (zext LValue to i64)), addr)
+  // Expect both operands of OR and the first operand of SHL to have only
+  // one use.
+  Value *LValue, *HValue;
+  if (!match(SI.getValueOperand(),
+             m_c_Or(m_OneUse(m_ZExt(m_Value(LValue))),
+                    m_OneUse(m_Shl(m_OneUse(m_ZExt(m_Value(HValue))),
+                                   m_SpecificInt(HalfValBitSize))))))
+    return false;
+
+  // Check that LValue and HValue are integers no wider than HalfValBitSize.
+  if (!LValue->getType()->isIntegerTy() ||
+      DL.getTypeSizeInBits(LValue->getType()) > HalfValBitSize ||
+      !HValue->getType()->isIntegerTy() ||
+      DL.getTypeSizeInBits(HValue->getType()) > HalfValBitSize)
+    return false;
+
+  // If LValue/HValue is a bitcast instruction, use the EVT of the bitcast's
+  // source operand as the input to the target query.
+  auto *LBC = dyn_cast<BitCastInst>(LValue);
+  auto *HBC = dyn_cast<BitCastInst>(HValue);
+  EVT LowTy = LBC ? EVT::getEVT(LBC->getOperand(0)->getType())
+                  : EVT::getEVT(LValue->getType());
+  EVT HighTy = HBC ? EVT::getEVT(HBC->getOperand(0)->getType())
+                   : EVT::getEVT(HValue->getType());
+  if (!ForceSplitStore && !TLI.isMultiStoresCheaperThanBitsMerge(LowTy, HighTy))
+    return false;
+
+  // Start to split store.
+  IRBuilder<> Builder(SI.getContext());
+  Builder.SetInsertPoint(&SI);
+
+  // If LValue/HValue is a bitcast in another BB, create a new one in the
+  // current BB so it may be merged with the split stores by the DAG combiner.
+  if (LBC && LBC->getParent() != SI.getParent())
+    LValue = Builder.CreateBitCast(LBC->getOperand(0), LBC->getType());
+  if (HBC && HBC->getParent() != SI.getParent())
+    HValue = Builder.CreateBitCast(HBC->getOperand(0), HBC->getType());
+
+  auto CreateSplitStore = [&](Value *V, bool Upper) {
+    V = Builder.CreateZExtOrBitCast(V, SplitStoreType);
+    Value *Addr = Builder.CreateBitCast(
+        SI.getOperand(1),
+        SplitStoreType->getPointerTo(SI.getPointerAddressSpace()));
+    if (Upper)
+      Addr = Builder.CreateGEP(
+          SplitStoreType, Addr,
+          ConstantInt::get(Type::getInt32Ty(SI.getContext()), 1));
+    Builder.CreateAlignedStore(
+        V, Addr, Upper ? SI.getAlignment() / 2 : SI.getAlignment());
+  };
+
+  CreateSplitStore(LValue, false);
+  CreateSplitStore(HValue, true);
+
+  // Delete the old store.
+  SI.eraseFromParent();
+  return true;
+}
+
 bool CodeGenPrepare::optimizeInst(Instruction *I, bool& ModifiedDT) {
   // Bail out if we inserted the instruction to prevent optimizations from
   // stepping on each other's toes.
@@ -5422,6 +5537,8 @@ bool CodeGenPrepare::optimizeInst(Instru
   }
 
   if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+    if (TLI && splitMergedValStore(*SI, *DL, *TLI))
+      return true;
     SI->setMetadata(LLVMContext::MD_invariant_group, nullptr);
     if (TLI) {
       unsigned AS = SI->getPointerAddressSpace();
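
To make the CreateSplitStore lambda above concrete, here is a sketch (value
names are hypothetical) of the IR it emits when an align-8 i64 store of an
{i16, i16} merge is split: the lower half keeps the original alignment, and
the upper half, HalfValBitSize/8 = 4 bytes in, gets half of it. Note the
lambda recreates the pointer bitcast for each half:

  %lo  = zext i16 %lv to i32                 ; CreateZExtOrBitCast of LValue
  %a0  = bitcast i64* %addr to i32*
  store i32 %lo, i32* %a0, align 8           ; lower half at addr
  %hi  = zext i16 %hv to i32                 ; CreateZExtOrBitCast of HValue
  %a1  = bitcast i64* %addr to i32*
  %a1g = getelementptr i32, i32* %a1, i32 1  ; addr + 4 bytes
  store i32 %hi, i32* %a1g, align 4          ; upper half, alignment halved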

Modified: llvm/trunk/test/CodeGen/X86/split-store.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/split-store.ll?rev=290365&r1=290364&r2=290365&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/split-store.ll (original)
+++ llvm/trunk/test/CodeGen/X86/split-store.ll Thu Dec 22 13:44:45 2016
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=x86_64-unknown-unknown < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-unknown-unknown -force-split-store < %s | FileCheck %s
 
 ; CHECK-LABEL: int32_float_pair
 ; CHECK: movl %edi, (%rsi)
@@ -57,3 +57,200 @@ entry:
   store i64 %t4, i64* %ref.tmp, align 8
   ret void
 }
+
+; CHECK-LABEL: int32_int32_pair
+; CHECK: movl	%edi, (%rdx)
+; CHECK: movl	%esi, 4(%rdx)
+define void @int32_int32_pair(i32 %tmp1, i32 %tmp2, i64* %ref.tmp) {
+entry:
+  %t1 = zext i32 %tmp2 to i64
+  %t2 = shl nuw i64 %t1, 32
+  %t3 = zext i32 %tmp1 to i64
+  %t4 = or i64 %t2, %t3
+  store i64 %t4, i64* %ref.tmp, align 8
+  ret void
+}
+
+; CHECK-LABEL: int16_int16_pair
+; CHECK: movw	%di, (%rdx)
+; CHECK: movw	%si, 2(%rdx)
+define void @int16_int16_pair(i16 signext %tmp1, i16 signext %tmp2, i32* %ref.tmp) {
+entry:
+  %t1 = zext i16 %tmp2 to i32
+  %t2 = shl nuw i32 %t1, 16
+  %t3 = zext i16 %tmp1 to i32
+  %t4 = or i32 %t2, %t3
+  store i32 %t4, i32* %ref.tmp, align 4
+  ret void
+}
+
+; CHECK-LABEL: int8_int8_pair
+; CHECK: movb	%dil, (%rdx)
+; CHECK: movb	%sil, 1(%rdx)
+define void @int8_int8_pair(i8 signext %tmp1, i8 signext %tmp2, i16* %ref.tmp) {
+entry:
+  %t1 = zext i8 %tmp2 to i16
+  %t2 = shl nuw i16 %t1, 8
+  %t3 = zext i8 %tmp1 to i16
+  %t4 = or i16 %t2, %t3
+  store i16 %t4, i16* %ref.tmp, align 2
+  ret void
+}
+
+; CHECK-LABEL: int31_int31_pair
+; CHECK: andl $2147483647, %edi
+; CHECK: movl %edi, (%rdx)
+; CHECK: andl $2147483647, %esi
+; CHECK: movl %esi, 4(%rdx)
+define void @int31_int31_pair(i31 %tmp1, i31 %tmp2, i64* %ref.tmp) {
+entry:
+  %t1 = zext i31 %tmp2 to i64
+  %t2 = shl nuw i64 %t1, 32
+  %t3 = zext i31 %tmp1 to i64
+  %t4 = or i64 %t2, %t3
+  store i64 %t4, i64* %ref.tmp, align 8
+  ret void
+}
+
+; CHECK-LABEL: int31_int17_pair
+; CHECK: andl $2147483647, %edi
+; CHECK: movl %edi, (%rdx)
+; CHECK: andl $131071, %esi
+; CHECK: movl %esi, 4(%rdx)
+define void @int31_int17_pair(i31 %tmp1, i17 %tmp2, i64* %ref.tmp) {
+entry:
+  %t1 = zext i17 %tmp2 to i64
+  %t2 = shl nuw i64 %t1, 32
+  %t3 = zext i31 %tmp1 to i64
+  %t4 = or i64 %t2, %t3
+  store i64 %t4, i64* %ref.tmp, align 8
+  ret void
+}
+
+; CHECK-LABEL: int7_int3_pair
+; CHECK: andb $127, %dil
+; CHECK: movb %dil, (%rdx)
+; CHECK: andb $7, %sil
+; CHECK: movb %sil, 1(%rdx)
+define void @int7_int3_pair(i7 signext %tmp1, i3 signext %tmp2, i16* %ref.tmp) {
+entry:
+  %t1 = zext i3 %tmp2 to i16
+  %t2 = shl nuw i16 %t1, 8
+  %t3 = zext i7 %tmp1 to i16
+  %t4 = or i16 %t2, %t3
+  store i16 %t4, i16* %ref.tmp, align 2
+  ret void
+}
+
+; CHECK-LABEL: int24_int24_pair
+; CHECK: movw	%di, (%rdx)
+; CHECK: shrl	$16, %edi
+; CHECK: movb	%dil, 2(%rdx)
+; CHECK: movl	%esi, %eax
+; CHECK: shrl	$16, %eax
+; CHECK: movb	%al, 6(%rdx)
+; CHECK: movw	%si, 4(%rdx)
+define void @int24_int24_pair(i24 signext %tmp1, i24 signext %tmp2, i48* %ref.tmp) {
+entry:
+  %t1 = zext i24 %tmp2 to i48
+  %t2 = shl nuw i48 %t1, 24
+  %t3 = zext i24 %tmp1 to i48
+  %t4 = or i48 %t2, %t3
+  store i48 %t4, i48* %ref.tmp, align 2
+  ret void
+}
+
+; The split store type would be i12, and getTypeSizeInBits(i12) != getTypeStoreSizeInBits(i12), so store splitting doesn't kick in.
+; CHECK-LABEL: int12_int12_pair
+; CHECK: movl	%esi, %eax
+; CHECK: shll	$12, %eax
+; CHECK: andl	$4095, %edi
+; CHECK: orl	%eax, %edi
+; CHECK: shrl	$4, %esi
+; CHECK: movb	%sil, 2(%rdx)
+; CHECK: movw	%di, (%rdx)
+define void @int12_int12_pair(i12 signext %tmp1, i12 signext %tmp2, i24* %ref.tmp) {
+entry:
+  %t1 = zext i12 %tmp2 to i24
+  %t2 = shl nuw i24 %t1, 12
+  %t3 = zext i12 %tmp1 to i24
+  %t4 = or i24 %t2, %t3
+  store i24 %t4, i24* %ref.tmp, align 2
+  ret void
+}
+
+; getTypeSizeInBits(i14) != getTypeStoreSizeInBits(i14), so store splitting doesn't kick in.
+; CHECK-LABEL: int7_int7_pair
+; CHECK: movzbl	%sil, %eax
+; CHECK: shll	$7, %eax
+; CHECK: andb	$127, %dil
+; CHECK: movzbl	%dil, %ecx
+; CHECK: orl	%eax, %ecx
+; CHECK: andl	$16383, %ecx
+; CHECK: movw	%cx, (%rdx)
+define void @int7_int7_pair(i7 signext %tmp1, i7 signext %tmp2, i14* %ref.tmp) {
+entry:
+  %t1 = zext i7 %tmp2 to i14
+  %t2 = shl nuw i14 %t1, 7
+  %t3 = zext i7 %tmp1 to i14
+  %t4 = or i14 %t2, %t3
+  store i14 %t4, i14* %ref.tmp, align 2
+  ret void
+}
+
+; getTypeSizeInBits(i2) != getTypeStoreSizeInBits(i2), so store splitting doesn't kick in.
+; CHECK-LABEL: int1_int1_pair
+; CHECK: addb %sil, %sil
+; CHECK: andb $1, %dil
+; CHECK: orb %sil, %dil
+; CHECK: andb $3, %dil
+; CHECK: movb %dil, (%rdx)
+define void @int1_int1_pair(i1 signext %tmp1, i1 signext %tmp2, i2* %ref.tmp) {
+entry:
+  %t1 = zext i1 %tmp2 to i2
+  %t2 = shl nuw i2 %t1, 1
+  %t3 = zext i1 %tmp1 to i2
+  %t4 = or i2 %t2, %t3
+  store i2 %t4, i2* %ref.tmp, align 1
+  ret void
+}
+
+; CHECK-LABEL: mbb_int32_float_pair
+; CHECK: movl %edi, (%rsi)
+; CHECK: movss %xmm0, 4(%rsi)
+define void @mbb_int32_float_pair(i32 %tmp1, float %tmp2, i64* %ref.tmp) {
+entry:
+  %t0 = bitcast float %tmp2 to i32
+  br label %next
+next:
+  %t1 = zext i32 %t0 to i64
+  %t2 = shl nuw i64 %t1, 32
+  %t3 = zext i32 %tmp1 to i64
+  %t4 = or i64 %t2, %t3
+  store i64 %t4, i64* %ref.tmp, align 8
+  ret void
+}
+
+; CHECK-LABEL: mbb_int32_float_multi_stores
+; CHECK: movl %edi, (%rsi)
+; CHECK: movss %xmm0, 4(%rsi)
+; CHECK: # %bb2
+; CHECK: movl %edi, (%rdx)
+; CHECK: movss %xmm0, 4(%rdx)
+define void @mbb_int32_float_multi_stores(i32 %tmp1, float %tmp2, i64* %ref.tmp, i64* %ref.tmp1, i1 %cmp) {
+entry:
+  %t0 = bitcast float %tmp2 to i32
+  br label %bb1
+bb1:
+  %t1 = zext i32 %t0 to i64
+  %t2 = shl nuw i64 %t1, 32
+  %t3 = zext i32 %tmp1 to i64
+  %t4 = or i64 %t2, %t3
+  store i64 %t4, i64* %ref.tmp, align 8
+  br i1 %cmp, label %bb2, label %exitbb
+bb2:
+  store i64 %t4, i64* %ref.tmp1, align 8
+  br label %exitbb
+exitbb:
+  ret void
+}
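
One more hypothetical example (not part of this commit) to illustrate the
m_OneUse constraints in the matcher: if the zext feeding the shl has a second
use, the pattern is rejected and the store is left merged.

  declare void @use(i64)

  define void @no_split_extra_use(i32 %lo, i32 %hi, i64* %p) {
  entry:
    %t1 = zext i32 %hi to i64
    %t2 = shl nuw i64 %t1, 32
    %t3 = zext i32 %lo to i64
    %t4 = or i64 %t2, %t3
    store i64 %t4, i64* %p, align 8
    call void @use(i64 %t1)        ; second use of %t1 defeats m_OneUse
    ret void
  }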



