[llvm] r248420 - Swap loop invariant GEP with loop variant GEP to allow more LICM.

Wed Sep 23 12:25:31 PDT 2015

Author: lawrence
Date: Wed Sep 23 14:25:30 2015
New Revision: 248420

URL: http://llvm.org/viewvc/llvm-project?rev=248420&view=rev
Log:

    Swap loop invariant GEP with loop variant GEP to allow more LICM.

    This patch changes the order of GEPs generated by Splitting GEPs
    pass, specially when one of the GEPs has constant and the base is
    loop invariant, then we will generate the GEP with constant first
    when beneficial, to expose more cases for LICM.

    If originally Splitting GEP generate the following:
      do.body.i:
        %idxprom.i = sext i32 %shr.i to i64
        %2 = bitcast %typeD* %s to i8*
        %3 = shl i64 %idxprom.i, 2
        %uglygep = getelementptr i8, i8* %2, i64 %3
        %uglygep7 = getelementptr i8, i8* %uglygep, i64 1032
      ...
    Now it genereates:
      do.body.i:
        %idxprom.i = sext i32 %shr.i to i64
        %2 = bitcast %typeD* %s to i8*
        %3 = shl i64 %idxprom.i, 2
        %uglygep = getelementptr i8, i8* %2, i64 1032
        %uglygep7 = getelementptr i8, i8* %uglygep, i64 %3
      ...

    For no-loop cases, the original way of generating GEPs seems to
    expose more CSE cases, so we don't change the logic for no-loop
    cases, and only limit our change to the specific case we are
    interested in.

Added:
    llvm/trunk/test/CodeGen/AArch64/aarch64-loop-gep-opt.ll
Modified:
    llvm/trunk/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp

Modified: llvm/trunk/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp?rev=248420&r1=248419&r2=248420&view=diff
==============================================================================

--- llvm/trunk/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp (original)
+++ llvm/trunk/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp Wed Sep 23 14:25:30 2015
@@ -157,6 +157,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/Constants.h"
@@ -324,7 +327,9 @@ public:
     AU.addRequired<DominatorTreeWrapperPass>();
     AU.addRequired<ScalarEvolutionWrapperPass>();
     AU.addRequired<TargetTransformInfoWrapperPass>();
+    AU.addRequired<LoopInfoWrapperPass>();
     AU.setPreservesCFG();
+    AU.addRequired<TargetLibraryInfoWrapperPass>();
   }
 
   bool doInitialization(Module &M) override {
@@ -395,10 +400,20 @@ private:
   /// Verify F is free of dead code.
   void verifyNoDeadCode(Function &F);
 
+  bool hasMoreThanOneUseInLoop(Value *v, Loop *L);
+  // Swap the index operand of two GEP.
+  void swapGEPOperand(GetElementPtrInst *First, GetElementPtrInst *Second);
+  // Check if it is safe to swap operand of two GEP.
+  bool isLegalToSwapOperand(GetElementPtrInst *First, GetElementPtrInst *Second,
+                            Loop *CurLoop);
+
   const DataLayout *DL;
   DominatorTree *DT;
   ScalarEvolution *SE;
   const TargetMachine *TM;
+
+  LoopInfo *LI;
+  TargetLibraryInfo *TLI;
   /// Whether to lower a GEP with multiple indices into arithmetic operations or
   /// multiple GEPs with a single index.
   bool LowerGEP;
@@ -414,6 +429,8 @@ INITIALIZE_PASS_BEGIN(
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
 INITIALIZE_PASS_END(
     SeparateConstOffsetFromGEP, "separate-const-offset-from-gep",
     "Split GEPs to a variadic base and a constant offset for better CSE", false,
@@ -756,6 +773,13 @@ void SeparateConstOffsetFromGEP::lowerTo
   Type *I8PtrTy =
       Builder.getInt8PtrTy(Variadic->getType()->getPointerAddressSpace());
   Value *ResultPtr = Variadic->getOperand(0);
+  Loop *L = LI->getLoopFor(Variadic->getParent());
+  // Check if the base is not loop invariant or used more than once.
+  bool isSwapCandidate =
+      L && L->isLoopInvariant(ResultPtr) &&
+      !hasMoreThanOneUseInLoop(ResultPtr, L);
+  Value *FirstResult = nullptr;
+
   if (ResultPtr->getType() != I8PtrTy)
     ResultPtr = Builder.CreateBitCast(ResultPtr, I8PtrTy);
 
@@ -784,6 +808,8 @@ void SeparateConstOffsetFromGEP::lowerTo
       // Create an ugly GEP with a single index for each index.
       ResultPtr =
           Builder.CreateGEP(Builder.getInt8Ty(), ResultPtr, Idx, "uglygep");
+      if (FirstResult == nullptr)
+        FirstResult = ResultPtr;
     }
   }
 
@@ -792,7 +818,17 @@ void SeparateConstOffsetFromGEP::lowerTo
     Value *Offset = ConstantInt::get(IntPtrTy, AccumulativeByteOffset);
     ResultPtr =
         Builder.CreateGEP(Builder.getInt8Ty(), ResultPtr, Offset, "uglygep");
-  }
+  } else
+    isSwapCandidate = false;
+
+  // If we created a GEP with constant index, and the base is loop invariant,
+  // then we swap the first one with it, so LICM can move constant GEP out
+  // later.
+  GetElementPtrInst *FirstGEP = dyn_cast<GetElementPtrInst>(FirstResult);
+  GetElementPtrInst *SecondGEP = dyn_cast<GetElementPtrInst>(ResultPtr);
+  if (isSwapCandidate && isLegalToSwapOperand(FirstGEP, SecondGEP, L))
+    swapGEPOperand(FirstGEP, SecondGEP);
+
   if (ResultPtr->getType() != Variadic->getType())
     ResultPtr = Builder.CreateBitCast(ResultPtr, Variadic->getType());
 
@@ -1036,16 +1072,15 @@ bool SeparateConstOffsetFromGEP::runOnFu
 
   DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
   SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
-
+  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
   bool Changed = false;
   for (Function::iterator B = F.begin(), BE = F.end(); B != BE; ++B) {
-    for (BasicBlock::iterator I = B->begin(), IE = B->end(); I != IE; ) {
-      if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I++)) {
+    for (BasicBlock::iterator I = B->begin(), IE = B->end(); I != IE;)
+      if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I++))
         Changed |= splitGEP(GEP);
-      }
-      // No need to split GEP ConstantExprs because all its indices are constant
-      // already.
-    }
+    // No need to split GEP ConstantExprs because all its indices are constant
+    // already.
   }
 
   Changed |= reuniteExts(F);
@@ -1138,3 +1173,93 @@ void SeparateConstOffsetFromGEP::verifyN
     }
   }
 }
+
+bool SeparateConstOffsetFromGEP::isLegalToSwapOperand(
+    GetElementPtrInst *FirstGEP, GetElementPtrInst *SecondGEP, Loop *CurLoop) {
+  if (!FirstGEP || !FirstGEP->hasOneUse())
+    return false;
+
+  if (!SecondGEP || FirstGEP->getParent() != SecondGEP->getParent())
+    return false;
+
+  if (FirstGEP == SecondGEP)
+    return false;
+
+  unsigned FirstNum = FirstGEP->getNumOperands();
+  unsigned SecondNum = SecondGEP->getNumOperands();
+  // Give up if the number of operands are not 2.
+  if (FirstNum != SecondNum || FirstNum != 2)
+    return false;
+
+  Value *FirstBase = FirstGEP->getOperand(0);
+  Value *SecondBase = SecondGEP->getOperand(0);
+  Value *FirstOffset = FirstGEP->getOperand(1);
+  // Give up if the index of the first GEP is loop invariant.
+  if (CurLoop->isLoopInvariant(FirstOffset))
+    return false;
+
+  // Give up if base doesn't have same type.
+  if (FirstBase->getType() != SecondBase->getType())
+    return false;
+
+  Instruction *FirstOffsetDef = dyn_cast<Instruction>(FirstOffset);
+
+  // Check if the second operand of first GEP has constant coefficient.
+  // For an example, for the following code,  we won't gain anything by
+  // hoisting the second GEP out because the second GEP can be folded away.
+  //   %scevgep.sum.ur159 = add i64 %idxprom48.ur, 256
+  //   %67 = shl i64 %scevgep.sum.ur159, 2
+  //   %uglygep160 = getelementptr i8* %65, i64 %67
+  //   %uglygep161 = getelementptr i8* %uglygep160, i64 -1024
+
+  // Skip constant shift instruction which may be generated by Splitting GEPs.
+  if (FirstOffsetDef && FirstOffsetDef->isShift() &&
+      dyn_cast<ConstantInt>(FirstOffsetDef->getOperand(1)))
+    FirstOffsetDef = dyn_cast<Instruction>(FirstOffsetDef->getOperand(0));
+
+  // Give up if FirstOffsetDef is an Add or Sub with constant.
+  // Because it may not profitable at all due to constant folding.
+  if (FirstOffsetDef)
+    if (BinaryOperator *BO = dyn_cast<BinaryOperator>(FirstOffsetDef)) {
+      unsigned opc = BO->getOpcode();
+      if ((opc == Instruction::Add || opc == Instruction::Sub) &&
+          (dyn_cast<ConstantInt>(BO->getOperand(0)) ||
+           dyn_cast<ConstantInt>(BO->getOperand(1))))
+        return false;
+    }
+  return true;
+}
+
+bool SeparateConstOffsetFromGEP::hasMoreThanOneUseInLoop(Value *V, Loop *L) {
+  int UsesInLoop = 0;
+  for (User *U : V->users()) {
+    if (Instruction *User = dyn_cast<Instruction>(U))
+      if (L->contains(User))
+        if (++UsesInLoop > 1)
+          return true;
+  }
+  return false;
+}
+
+void SeparateConstOffsetFromGEP::swapGEPOperand(GetElementPtrInst *First,
+                                                GetElementPtrInst *Second) {
+  Value *Offset1 = First->getOperand(1);
+  Value *Offset2 = Second->getOperand(1);
+  First->setOperand(1, Offset2);
+  Second->setOperand(1, Offset1);
+
+  // We changed p+o+c to p+c+o, p+c may not be inbound anymore.
+  const DataLayout &DAL = First->getModule()->getDataLayout();
+  APInt Offset(DAL.getPointerSizeInBits(
+                   cast<PointerType>(First->getType())->getAddressSpace()),
+               0);
+  Value *NewBase =
+      First->stripAndAccumulateInBoundsConstantOffsets(DAL, Offset);
+  uint64_t ObjectSize;
+  if (!getObjectSize(NewBase, ObjectSize, DAL, TLI) ||
+     Offset.ugt(ObjectSize)) {
+    First->setIsInBounds(false);
+    Second->setIsInBounds(false);
+  } else
+    First->setIsInBounds(true);
+}

Added: llvm/trunk/test/CodeGen/AArch64/aarch64-loop-gep-opt.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/aarch64-loop-gep-opt.ll?rev=248420&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/aarch64-loop-gep-opt.ll (added)
+++ llvm/trunk/test/CodeGen/AArch64/aarch64-loop-gep-opt.ll Wed Sep 23 14:25:30 2015
@@ -0,0 +1,50 @@
+; RUN: llc -O3 -aarch64-gep-opt=true  -print-after=codegenprepare -mcpu=cortex-a53 < %s >%t 2>&1 && FileCheck <%t %s
+; REQUIRES: asserts
+target triple = "aarch64--linux-android"
+
+%typeD = type { i32, i32, [256 x i32], [257 x i32] }
+
+; Function Attrs: noreturn nounwind uwtable
+define i32 @test1(%typeD* nocapture %s) {
+entry:
+; CHECK-LABEL: entry:
+; CHECK:    %uglygep = getelementptr i8, i8* %0, i64 1032
+; CHECK:    br label %do.body.i
+
+
+  %tPos = getelementptr inbounds %typeD, %typeD* %s, i64 0, i32 0
+  %k0 = getelementptr inbounds %typeD, %typeD* %s, i64 0, i32 1
+  %.pre = load i32, i32* %tPos, align 4
+  br label %do.body.i
+
+do.body.i:
+; CHECK-LABEL: do.body.i:
+; CHECK:          %uglygep2 = getelementptr i8, i8* %uglygep, i64 %3
+; CHECK-NEXT:     %4 = bitcast i8* %uglygep2 to i32*
+; CHECK-NOT:      %uglygep2 = getelementptr i8, i8* %uglygep, i64 1032
+
+
+  %0 = phi i32 [ 256, %entry ], [ %.be, %do.body.i.backedge ]
+  %1 = phi i32 [ 0, %entry ], [ %.be6, %do.body.i.backedge ]
+  %add.i = add nsw i32 %1, %0
+  %shr.i = ashr i32 %add.i, 1
+  %idxprom.i = sext i32 %shr.i to i64
+  %arrayidx.i = getelementptr inbounds %typeD, %typeD* %s, i64 0, i32 3, i64 %idxprom.i
+  %2 = load i32, i32* %arrayidx.i, align 4
+  %cmp.i = icmp sle i32 %2, %.pre
+  %na.1.i = select i1 %cmp.i, i32 %0, i32 %shr.i
+  %nb.1.i = select i1 %cmp.i, i32 %shr.i, i32 %1
+  %sub.i = sub nsw i32 %na.1.i, %nb.1.i
+  %cmp1.i = icmp eq i32 %sub.i, 1
+  br i1 %cmp1.i, label %fooo.exit, label %do.body.i.backedge
+
+do.body.i.backedge:
+  %.be = phi i32 [ %na.1.i, %do.body.i ], [ 256, %fooo.exit ]
+  %.be6 = phi i32 [ %nb.1.i, %do.body.i ], [ 0, %fooo.exit ]
+  br label %do.body.i
+
+fooo.exit:                              ; preds = %do.body.i
+  store i32 %nb.1.i, i32* %k0, align 4
+  br label %do.body.i.backedge
+}
+