[llvm] cbc50ba - [AggressiveInstCombine] Handle the nested GEP/BitCast scenario in Load Merge.
via llvm-commits
llvm-commits at lists.llvm.org
Wed May 24 02:36:43 PDT 2023
Author: bipmis
Date: 2023-05-24T10:36:11+01:00
New Revision: cbc50ba12e61a3cc311908bfdf532d37f917ccf4
URL: https://github.com/llvm/llvm-project/commit/cbc50ba12e61a3cc311908bfdf532d37f917ccf4
DIFF: https://github.com/llvm/llvm-project/commit/cbc50ba12e61a3cc311908bfdf532d37f917ccf4.diff
LOG: [AggressiveInstCombine] Handle the nested GEP/BitCast scenario in Load Merge.
Load merging currently has an issue when the load pointer is a nested/chained GEP or BitCast: the original pointer instruction may not dominate the insertion point chosen for the merged load. The patch instead generates a new GEP for the wider load at the insertion point to avoid these dominance problems.
Differential Revision: https://reviews.llvm.org/D150864
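In short, the previous code tried to hoist the load's pointer instruction above the insert point, which breaks down when that pointer is itself built on top of another GEP/BitCast. The new code re-materializes the address at the insert point instead. Below is a minimal C++ sketch of that logic; the helper name getDominatingLoadPtr is illustrative only and not part of the patch, whose actual change lives inside foldConsecutiveLoads as shown in the diff further down.

#include "llvm/ADT/APInt.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Illustrative helper: compute a pointer for the merged wide load that is
// guaranteed to dominate InsertPt, mirroring the logic the patch adds.
static Value *getDominatingLoadPtr(LoadInst *LI1, Instruction *InsertPt,
                                   const DataLayout &DL,
                                   const DominatorTree &DT,
                                   IRBuilder<> &Builder) {
  Value *Ptr = LI1->getPointerOperand();
  Builder.SetInsertPoint(InsertPt);
  if (DT.dominates(Ptr, InsertPt))
    return Ptr; // Existing pointer is already usable at the insert point.

  // Otherwise peel nested GEP/BitCast pointers down to the underlying base
  // pointer, accumulating their constant offsets...
  APInt Offset(DL.getIndexTypeSizeInBits(Ptr->getType()), 0);
  Ptr = Ptr->stripAndAccumulateConstantOffsets(DL, Offset,
                                               /*AllowNonInbounds=*/true);
  // ...and re-materialize the offset as a single byte-wise GEP at the
  // insert point, so the new pointer trivially dominates the wider load.
  return Builder.CreateGEP(Builder.getInt8Ty(), Ptr,
                           Builder.getInt32(Offset.getZExtValue()));
}

The pointer returned this way is then bitcast to the wider load's pointer type, exactly as in the diff below.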
Added:
Modified:
llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
llvm/test/Transforms/AggressiveInstCombine/AArch64/or-load.ll
llvm/test/Transforms/AggressiveInstCombine/X86/or-load.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index f7b114b53a843..b2925b1a9f2f3 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -766,7 +766,8 @@ static bool foldLoadsRecursive(Value *V, LoadOps &LOps, const DataLayout &DL,
// pattern which suggests that the loads can be combined. The one and only use
// of the loads is to form a wider load.
static bool foldConsecutiveLoads(Instruction &I, const DataLayout &DL,
- TargetTransformInfo &TTI, AliasAnalysis &AA) {
+ TargetTransformInfo &TTI, AliasAnalysis &AA,
+ const DominatorTree &DT) {
// Only consider load chains of scalar values.
if (isa<VectorType>(I.getType()))
return false;
@@ -791,15 +792,17 @@ static bool foldConsecutiveLoads(Instruction &I, const DataLayout &DL,
if (!Allowed || !Fast)
return false;
- // Make sure the Load pointer of type GEP/non-GEP is above insert point
- Instruction *Inst = dyn_cast<Instruction>(LI1->getPointerOperand());
- if (Inst && Inst->getParent() == LI1->getParent() &&
- !Inst->comesBefore(LOps.RootInsert))
- Inst->moveBefore(LOps.RootInsert);
-
- // New load can be generated
+ // Get the Index and Ptr for the new GEP.
Value *Load1Ptr = LI1->getPointerOperand();
Builder.SetInsertPoint(LOps.RootInsert);
+ if (!DT.dominates(Load1Ptr, LOps.RootInsert)) {
+ APInt Offset1(DL.getIndexTypeSizeInBits(Load1Ptr->getType()), 0);
+ Load1Ptr = Load1Ptr->stripAndAccumulateConstantOffsets(
+ DL, Offset1, /* AllowNonInbounds */ true);
+ Load1Ptr = Builder.CreateGEP(Builder.getInt8Ty(), Load1Ptr,
+ Builder.getInt32(Offset1.getZExtValue()));
+ }
+ // Generate wider load.
Value *NewPtr = Builder.CreateBitCast(Load1Ptr, WiderType->getPointerTo(AS));
NewLoad = Builder.CreateAlignedLoad(WiderType, NewPtr, LI1->getAlign(),
LI1->isVolatile(), "");
@@ -936,7 +939,7 @@ static bool foldUnusualPatterns(Function &F, DominatorTree &DT,
MadeChange |= tryToRecognizePopCount(I);
MadeChange |= tryToFPToSat(I, TTI);
MadeChange |= tryToRecognizeTableBasedCttz(I);
- MadeChange |= foldConsecutiveLoads(I, DL, TTI, AA);
+ MadeChange |= foldConsecutiveLoads(I, DL, TTI, AA, DT);
MadeChange |= foldPatternedLoads(I, DL);
// NOTE: This function introduces erasing of the instruction `I`, so it
// needs to be called at the end of this sequence, otherwise we may make
diff --git a/llvm/test/Transforms/AggressiveInstCombine/AArch64/or-load.ll b/llvm/test/Transforms/AggressiveInstCombine/AArch64/or-load.ll
index 8087137c01ee0..fe0a1bcd3870d 100644
--- a/llvm/test/Transforms/AggressiveInstCombine/AArch64/or-load.ll
+++ b/llvm/test/Transforms/AggressiveInstCombine/AArch64/or-load.ll
@@ -1869,8 +1869,8 @@ define i32 @loadCombine_4consecutive_badinsert2(ptr %p) {
define i32 @loadCombine_4consecutive_badinsert3(ptr %p) {
; LE-LABEL: @loadCombine_4consecutive_badinsert3(
-; LE-NEXT: [[P1:%.*]] = getelementptr i8, ptr [[P:%.*]], i32 1
-; LE-NEXT: [[L1:%.*]] = load i32, ptr [[P1]], align 1
+; LE-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[P:%.*]], i32 1
+; LE-NEXT: [[L1:%.*]] = load i32, ptr [[TMP1]], align 1
; LE-NEXT: ret i32 [[L1]]
;
; BE-LABEL: @loadCombine_4consecutive_badinsert3(
@@ -2085,3 +2085,82 @@ define i32 @loadCombine_4consecutive_badinsert6(ptr %p) {
%o3 = or i32 %o2, %e1
ret i32 %o3
}
+
+define void @nested_gep(ptr %p, ptr %dest) {
+; LE-LABEL: @nested_gep(
+; LE-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[P:%.*]], i32 68
+; LE-NEXT: [[LD2:%.*]] = load i64, ptr [[TMP1]], align 4
+; LE-NEXT: [[TRUNC:%.*]] = trunc i64 [[LD2]] to i32
+; LE-NEXT: store i32 [[TRUNC]], ptr [[DEST:%.*]], align 4
+; LE-NEXT: ret void
+;
+; BE-LABEL: @nested_gep(
+; BE-NEXT: [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 72
+; BE-NEXT: [[LD1:%.*]] = load i32, ptr [[GEP1]], align 4
+; BE-NEXT: [[LD1_ZEXT:%.*]] = zext i32 [[LD1]] to i64
+; BE-NEXT: [[LD1_SHL:%.*]] = shl nuw i64 [[LD1_ZEXT]], 32
+; BE-NEXT: [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 64
+; BE-NEXT: [[FINAL_PTR:%.*]] = getelementptr inbounds i8, ptr [[GEP2]], i64 4
+; BE-NEXT: [[LD2:%.*]] = load i32, ptr [[FINAL_PTR]], align 4
+; BE-NEXT: [[LD2_ZEXT:%.*]] = zext i32 [[LD2]] to i64
+; BE-NEXT: [[OR:%.*]] = or i64 [[LD1_SHL]], [[LD2_ZEXT]]
+; BE-NEXT: [[ADD:%.*]] = add i64 [[OR]], 0
+; BE-NEXT: [[TRUNC:%.*]] = trunc i64 [[ADD]] to i32
+; BE-NEXT: store i32 [[TRUNC]], ptr [[DEST:%.*]], align 4
+; BE-NEXT: ret void
+;
+ %gep1 = getelementptr inbounds i8, ptr %p, i64 72
+ %ld1 = load i32, ptr %gep1, align 4
+ %ld1_zext = zext i32 %ld1 to i64
+ %ld1_shl = shl nuw i64 %ld1_zext, 32
+ %gep2 = getelementptr inbounds i8, ptr %p, i64 64
+ ; Don't move final_ptr before gep2
+ %final_ptr = getelementptr inbounds i8, ptr %gep2, i64 4
+ %ld2 = load i32, ptr %final_ptr, align 4
+ %ld2_zext = zext i32 %ld2 to i64
+ %or = or i64 %ld1_shl, %ld2_zext
+ %add = add i64 %or, 0
+ %trunc = trunc i64 %add to i32
+ store i32 %trunc, ptr %dest, align 4
+ ret void
+}
+
+
+define void @bitcast_gep(ptr %p, ptr %dest) {
+; LE-LABEL: @bitcast_gep(
+; LE-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[P:%.*]], i32 68
+; LE-NEXT: [[LD2:%.*]] = load i64, ptr [[TMP1]], align 4
+; LE-NEXT: [[TRUNC:%.*]] = trunc i64 [[LD2]] to i32
+; LE-NEXT: store i32 [[TRUNC]], ptr [[DEST:%.*]], align 4
+; LE-NEXT: ret void
+;
+; BE-LABEL: @bitcast_gep(
+; BE-NEXT: [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 72
+; BE-NEXT: [[LD1:%.*]] = load i32, ptr [[GEP1]], align 4
+; BE-NEXT: [[LD1_ZEXT:%.*]] = zext i32 [[LD1]] to i64
+; BE-NEXT: [[LD1_SHL:%.*]] = shl nuw i64 [[LD1_ZEXT]], 32
+; BE-NEXT: [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 68
+; BE-NEXT: [[FINAL_PTR:%.*]] = bitcast ptr [[GEP2]] to ptr
+; BE-NEXT: [[LD2:%.*]] = load i32, ptr [[FINAL_PTR]], align 4
+; BE-NEXT: [[LD2_ZEXT:%.*]] = zext i32 [[LD2]] to i64
+; BE-NEXT: [[OR:%.*]] = or i64 [[LD1_SHL]], [[LD2_ZEXT]]
+; BE-NEXT: [[ADD:%.*]] = add i64 [[OR]], 0
+; BE-NEXT: [[TRUNC:%.*]] = trunc i64 [[ADD]] to i32
+; BE-NEXT: store i32 [[TRUNC]], ptr [[DEST:%.*]], align 4
+; BE-NEXT: ret void
+;
+ %gep1 = getelementptr inbounds i8, ptr %p, i64 72
+ %ld1 = load i32, ptr %gep1, align 4
+ %ld1_zext = zext i32 %ld1 to i64
+ %ld1_shl = shl nuw i64 %ld1_zext, 32
+ %gep2 = getelementptr inbounds i8, ptr %p, i64 68
+ ; Don't move final_ptr before gep2
+ %final_ptr = bitcast ptr %gep2 to ptr
+ %ld2 = load i32, ptr %final_ptr, align 4
+ %ld2_zext = zext i32 %ld2 to i64
+ %or = or i64 %ld1_shl, %ld2_zext
+ %add = add i64 %or, 0
+ %trunc = trunc i64 %add to i32
+ store i32 %trunc, ptr %dest, align 4
+ ret void
+}
diff --git a/llvm/test/Transforms/AggressiveInstCombine/X86/or-load.ll b/llvm/test/Transforms/AggressiveInstCombine/X86/or-load.ll
index 842b1f781eac7..88f92baa63ef0 100644
--- a/llvm/test/Transforms/AggressiveInstCombine/X86/or-load.ll
+++ b/llvm/test/Transforms/AggressiveInstCombine/X86/or-load.ll
@@ -2005,8 +2005,8 @@ define i32 @loadCombine_4consecutive_badinsert2(ptr %p) {
define i32 @loadCombine_4consecutive_badinsert3(ptr %p) {
; LE-LABEL: @loadCombine_4consecutive_badinsert3(
-; LE-NEXT: [[P1:%.*]] = getelementptr i8, ptr [[P:%.*]], i32 1
-; LE-NEXT: [[L1:%.*]] = load i32, ptr [[P1]], align 1
+; LE-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[P:%.*]], i32 1
+; LE-NEXT: [[L1:%.*]] = load i32, ptr [[TMP1]], align 1
; LE-NEXT: ret i32 [[L1]]
;
; BE-LABEL: @loadCombine_4consecutive_badinsert3(
@@ -2303,3 +2303,82 @@ define i64 @loadCombine_nonConstShift2(ptr %arg, i8 %b) {
%o7 = or i64 %s1, %s0
ret i64 %o7
}
+
+define void @nested_gep(ptr %p, ptr %dest) {
+; LE-LABEL: @nested_gep(
+; LE-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[P:%.*]], i32 68
+; LE-NEXT: [[LD2:%.*]] = load i64, ptr [[TMP1]], align 4
+; LE-NEXT: [[TRUNC:%.*]] = trunc i64 [[LD2]] to i32
+; LE-NEXT: store i32 [[TRUNC]], ptr [[DEST:%.*]], align 4
+; LE-NEXT: ret void
+;
+; BE-LABEL: @nested_gep(
+; BE-NEXT: [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 72
+; BE-NEXT: [[LD1:%.*]] = load i32, ptr [[GEP1]], align 4
+; BE-NEXT: [[LD1_ZEXT:%.*]] = zext i32 [[LD1]] to i64
+; BE-NEXT: [[LD1_SHL:%.*]] = shl nuw i64 [[LD1_ZEXT]], 32
+; BE-NEXT: [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 64
+; BE-NEXT: [[FINAL_PTR:%.*]] = getelementptr inbounds i8, ptr [[GEP2]], i64 4
+; BE-NEXT: [[LD2:%.*]] = load i32, ptr [[FINAL_PTR]], align 4
+; BE-NEXT: [[LD2_ZEXT:%.*]] = zext i32 [[LD2]] to i64
+; BE-NEXT: [[OR:%.*]] = or i64 [[LD1_SHL]], [[LD2_ZEXT]]
+; BE-NEXT: [[ADD:%.*]] = add i64 [[OR]], 0
+; BE-NEXT: [[TRUNC:%.*]] = trunc i64 [[ADD]] to i32
+; BE-NEXT: store i32 [[TRUNC]], ptr [[DEST:%.*]], align 4
+; BE-NEXT: ret void
+;
+ %gep1 = getelementptr inbounds i8, ptr %p, i64 72
+ %ld1 = load i32, ptr %gep1, align 4
+ %ld1_zext = zext i32 %ld1 to i64
+ %ld1_shl = shl nuw i64 %ld1_zext, 32
+ %gep2 = getelementptr inbounds i8, ptr %p, i64 64
+ ; Don't move final_ptr before gep2
+ %final_ptr = getelementptr inbounds i8, ptr %gep2, i64 4
+ %ld2 = load i32, ptr %final_ptr, align 4
+ %ld2_zext = zext i32 %ld2 to i64
+ %or = or i64 %ld1_shl, %ld2_zext
+ %add = add i64 %or, 0
+ %trunc = trunc i64 %add to i32
+ store i32 %trunc, ptr %dest, align 4
+ ret void
+}
+
+
+define void @bitcast_gep(ptr %p, ptr %dest) {
+; LE-LABEL: @bitcast_gep(
+; LE-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[P:%.*]], i32 68
+; LE-NEXT: [[LD2:%.*]] = load i64, ptr [[TMP1]], align 4
+; LE-NEXT: [[TRUNC:%.*]] = trunc i64 [[LD2]] to i32
+; LE-NEXT: store i32 [[TRUNC]], ptr [[DEST:%.*]], align 4
+; LE-NEXT: ret void
+;
+; BE-LABEL: @bitcast_gep(
+; BE-NEXT: [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 72
+; BE-NEXT: [[LD1:%.*]] = load i32, ptr [[GEP1]], align 4
+; BE-NEXT: [[LD1_ZEXT:%.*]] = zext i32 [[LD1]] to i64
+; BE-NEXT: [[LD1_SHL:%.*]] = shl nuw i64 [[LD1_ZEXT]], 32
+; BE-NEXT: [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 68
+; BE-NEXT: [[FINAL_PTR:%.*]] = bitcast ptr [[GEP2]] to ptr
+; BE-NEXT: [[LD2:%.*]] = load i32, ptr [[FINAL_PTR]], align 4
+; BE-NEXT: [[LD2_ZEXT:%.*]] = zext i32 [[LD2]] to i64
+; BE-NEXT: [[OR:%.*]] = or i64 [[LD1_SHL]], [[LD2_ZEXT]]
+; BE-NEXT: [[ADD:%.*]] = add i64 [[OR]], 0
+; BE-NEXT: [[TRUNC:%.*]] = trunc i64 [[ADD]] to i32
+; BE-NEXT: store i32 [[TRUNC]], ptr [[DEST:%.*]], align 4
+; BE-NEXT: ret void
+;
+ %gep1 = getelementptr inbounds i8, ptr %p, i64 72
+ %ld1 = load i32, ptr %gep1, align 4
+ %ld1_zext = zext i32 %ld1 to i64
+ %ld1_shl = shl nuw i64 %ld1_zext, 32
+ %gep2 = getelementptr inbounds i8, ptr %p, i64 68
+ ; Don't move final_ptr before gep2
+ %final_ptr = bitcast ptr %gep2 to ptr
+ %ld2 = load i32, ptr %final_ptr, align 4
+ %ld2_zext = zext i32 %ld2 to i64
+ %or = or i64 %ld1_shl, %ld2_zext
+ %add = add i64 %or, 0
+ %trunc = trunc i64 %add to i32
+ store i32 %trunc, ptr %dest, align 4
+ ret void
+}