[llvm] 434b0ba - [AggressiveInstCombine] folding load for constant global patterned arrays and structs by alignment
via llvm-commits
llvm-commits at lists.llvm.org
Thu Mar 23 07:31:57 PDT 2023
Author: khei4
Date: 2023-03-23T23:31:22+09:00
New Revision: 434b0badb5d53138490a075dd945df7480649154
URL: https://github.com/llvm/llvm-project/commit/434b0badb5d53138490a075dd945df7480649154
DIFF: https://github.com/llvm/llvm-project/commit/434b0badb5d53138490a075dd945df7480649154.diff
LOG: [AggressiveInstCombine] folding load for constant global patterned arrays and structs by alignment
Differential Revision: https://reviews.llvm.org/D144445
Reviewed By: nikic
fix: wrong arrow
Added:
Modified:
llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
llvm/test/Transforms/AggressiveInstCombine/patterned-load.ll
Removed:
################################################################################
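Before the patch itself, here is a minimal sketch of the kind of IR the new fold targets: a load from a constant global whose initializer repeats with a period dividing the load alignment, so every offset the load can legally use yields the same value. The global @pat and the function name are illustrative, not copied from the patch's test file:

@pat = internal constant [8 x i8] c"\01\00\01\00\01\00\01\00", align 4

define i8 @load_pat(i64 %idx) {
  ; The `align 2` on the load restricts the possible offsets into @pat to
  ; multiples of 2, and every such in-bounds offset holds the byte 1, so
  ; aggressive-instcombine can now fold this to `ret i8 1`.
  %p = getelementptr inbounds i8, ptr @pat, i64 %idx
  %v = load i8, ptr %p, align 2
  ret i8 %v
}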
diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index 473b41241b8a6..cf652836bef25 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -18,6 +18,7 @@
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
@@ -305,7 +306,7 @@ static bool tryToRecognizePopCount(Instruction &I) {
Value *MulOp0;
// Matching "(i * 0x01010101...) >> 24".
if ((match(Op0, m_Mul(m_Value(MulOp0), m_SpecificInt(Mask01)))) &&
- match(Op1, m_SpecificInt(MaskShift))) {
+ match(Op1, m_SpecificInt(MaskShift))) {
Value *ShiftOp0;
// Matching "((i + (i >> 4)) & 0x0F0F0F0F...)".
if (match(MulOp0, m_And(m_c_Add(m_LShr(m_Value(ShiftOp0), m_SpecificInt(4)),
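(The hunk above appears to be a whitespace-only reflow inside tryToRecognizePopCount, but for context, its comments refer to the classic SWAR popcount idiom. A sketch of that idiom for i32, illustrative only; whether the recognizer rewrites a given input to @llvm.ctpop.i32 depends on the full set of matchers in the function.)

define i32 @popcount_idiom(i32 %x) {
  %half = lshr i32 %x, 1
  %odd  = and i32 %half, 1431655765      ; 0x55555555
  %v1   = sub i32 %x, %odd               ; bit pairs now hold their popcount
  %q    = lshr i32 %v1, 2
  %lo2  = and i32 %v1, 858993459         ; 0x33333333
  %hi2  = and i32 %q, 858993459
  %v2   = add i32 %lo2, %hi2             ; nibbles hold their popcount
  %s4   = lshr i32 %v2, 4
  %sum  = add i32 %v2, %s4
  %v3   = and i32 %sum, 252645135        ; "((i + (i >> 4)) & 0x0F0F0F0F...)"
  %mul  = mul i32 %v3, 16843009          ; "(i * 0x01010101...)"
  %res  = lshr i32 %mul, 24              ; top byte accumulates the byte sums
  ret i32 %res
}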
@@ -401,8 +402,8 @@ static bool tryToFPToSat(Instruction &I, TargetTransformInfo &TTI) {
/// Try to replace a mathlib call to sqrt with the LLVM intrinsic. This avoids
/// pessimistic codegen that has to account for setting errno and can enable
/// vectorization.
-static bool
-foldSqrt(Instruction &I, TargetTransformInfo &TTI, TargetLibraryInfo &TLI) {
+static bool foldSqrt(Instruction &I, TargetTransformInfo &TTI,
+ TargetLibraryInfo &TLI) {
// Match a call to sqrt mathlib function.
auto *Call = dyn_cast<CallInst>(&I);
if (!Call)
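(This hunk only reflows foldSqrt's signature. For context on the comment above: the rewrite it describes turns a libm sqrt call into the @llvm.sqrt intrinsic when foldSqrt's checks allow it. A minimal sketch, assuming those checks, such as target support and the call's no-NaN style guarantees, are satisfied; the function name is illustrative.)

declare double @sqrt(double)

define double @sqrt_to_intrinsic(double %x) {
  ; A libm call may set errno; when it is safe to ignore that (e.g. the call
  ; carries nnan and the target has a fast sqrt), foldSqrt replaces it with
  ; the errno-free @llvm.sqrt.f64 intrinsic.
  %r = call nnan double @sqrt(double %x)
  ret double %r
}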
@@ -824,6 +825,58 @@ static bool foldConsecutiveLoads(Instruction &I, const DataLayout &DL,
return true;
}
+/// If C is a constant patterned array and all valid loaded results for the
+/// given alignment equal the same constant, fold the load to that constant.
+static bool foldPatternedLoads(Instruction &I, const DataLayout &DL) {
+ auto *LI = dyn_cast<LoadInst>(&I);
+ if (!LI || LI->isVolatile())
+ return false;
+
+ // We can only fold the load if it is from a constant global with a
+ // definitive initializer. Skip the expensive logic if this is not the case.
+ auto *PtrOp = LI->getPointerOperand();
+ auto *GV = dyn_cast<GlobalVariable>(getUnderlyingObject(PtrOp));
+ if (!GV || !GV->isConstant() || !GV->hasDefinitiveInitializer())
+ return false;
+
+ Type *LoadTy = LI->getType();
+ Constant *C = GV->getInitializer();
+
+ // Bail for large initializers in excess of 4K to avoid too many scans.
+ uint64_t GVSize = DL.getTypeAllocSize(C->getType());
+ if (!GVSize || 4096 < GVSize)
+ return false;
+
+ // Check whether the pointer arrives back at the global variable. If PtrOp is
+ // neither a GlobalVariable nor a GEP, it might not arrive back at the
+ // GlobalVariable.
+ // TODO: Implement GEP handling.
+ unsigned BW = DL.getIndexTypeSizeInBits(PtrOp->getType());
+ // TODO: Determine stride based on GEPs.
+ APInt Stride(BW, 1);
+ APInt ConstOffset(BW, 0);
+
+ // Any possible offset is a multiple of the GEP stride, and any valid offset
+ // is a multiple of the load alignment, so checking only multiples of the
+ // larger of the two is sufficient to establish that all loaded results are
+ // equal.
+ if (auto LA = LI->getAlign();
+ LA <= GV->getAlign().valueOrOne() && Stride.getZExtValue() < LA.value())
+ Stride = APInt(BW, LA.value());
+
+ Constant *Ca = ConstantFoldLoadFromConst(C, LoadTy, ConstOffset, DL);
+ if (!Ca)
+ return false;
+
+ unsigned E = GVSize - DL.getTypeStoreSize(LoadTy);
+ for (; ConstOffset.getZExtValue() <= E; ConstOffset += Stride)
+ if (Ca != ConstantFoldLoadFromConst(C, LoadTy, ConstOffset, DL))
+ return false;
+
+ I.replaceAllUsesWith(Ca);
+
+ return true;
+}
+
/// This is the entry point for folds that could be implemented in regular
/// InstCombine, but they are separated because they are not expected to
/// occur frequently and/or have more than a constant-length pattern match.
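Returning to foldPatternedLoads above: the stride/alignment comment justifies scanning only offsets 0, Stride, 2*Stride, ... and bailing on the first mismatch. A hypothetical negative counterpart to the earlier sketch (illustrative only) shows where that scan fails:

@pat = internal constant [8 x i8] c"\01\00\01\00\01\00\01\00", align 4

define i8 @load_pat_align1(i64 %idx) {
  ; With align 1 the stride stays 1, so the scan compares offsets 0, 1, 2, ...;
  ; offset 0 reads 1 but offset 1 reads 0, so the load is left untouched.
  %p = getelementptr inbounds i8, ptr @pat, i64 %idx
  %v = load i8, ptr %p, align 1
  ret i8 %v
}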
@@ -850,6 +903,7 @@ static bool foldUnusualPatterns(Function &F, DominatorTree &DT,
MadeChange |= tryToFPToSat(I, TTI);
MadeChange |= tryToRecognizeTableBasedCttz(I);
MadeChange |= foldConsecutiveLoads(I, DL, TTI, AA);
+ MadeChange |= foldPatternedLoads(I, DL);
// NOTE: This function introduces erasing of the instruction `I`, so it
// needs to be called at the end of this sequence, otherwise we may make
// bugs.
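In the test changes below, the LE and BE check prefixes correspond to running the file under little- and big-endian data layouts. With the repeating bytes 01 00, a 4-byte-aligned i32 load reads 0x00010001 = 65537 little-endian and 0x01000100 = 16777472 big-endian. A hypothetical standalone version of that case (not the test file's exact contents):

; Little-endian layout; under a big-endian layout ("E") the same load would
; fold to 16777472 (0x01000100) instead of 65537 (0x00010001).
target datalayout = "e"

@pat = internal constant [8 x i8] c"\01\00\01\00\01\00\01\00", align 4

define i32 @load_pat_i32(i64 %idx) {
  ; Both in-bounds 4-byte-aligned offsets (0 and 4) read the bytes 01 00 01 00.
  %p = getelementptr inbounds i32, ptr @pat, i64 %idx
  %v = load i32, ptr %p, align 4
  ret i32 %v
}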
diff --git a/llvm/test/Transforms/AggressiveInstCombine/patterned-load.ll b/llvm/test/Transforms/AggressiveInstCombine/patterned-load.ll
index 5410a21e3211d..7acc6109744ca 100644
--- a/llvm/test/Transforms/AggressiveInstCombine/patterned-load.ll
+++ b/llvm/test/Transforms/AggressiveInstCombine/patterned-load.ll
@@ -12,12 +12,9 @@
@constpackedstruct = internal constant <{[8 x i8]}> <{[8 x i8] c"\01\00\01\00\01\00\01\00"}>, align 4
@conststruct = internal constant {i16, [8 x i8]} {i16 1, [8 x i8] c"\01\00\01\00\01\00\01\00"}, align 4
-; TODO: this will be ret i8 1
define i8 @inbounds_gep_load_i8_align2(i64 %idx){
; CHECK-LABEL: @inbounds_gep_load_i8_align2(
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr @constarray1, i64 [[IDX:%.*]]
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr [[TMP1]], align 2
-; CHECK-NEXT: ret i8 [[TMP2]]
+; CHECK-NEXT: ret i8 1
;
%1 = getelementptr inbounds i8, ptr @constarray1, i64 %idx
%2 = load i8, ptr %1, align 2
@@ -53,10 +50,7 @@ declare ptr @llvm.ptrmask.p0.i64(ptr , i64)
; can't be folded because ptrmask can change ptr, while preserving provenance
define i8 @inbounds_gep_load_i8_align2_ptrmasked(i64 %idx, i64 %mask){
; CHECK-LABEL: @inbounds_gep_load_i8_align2_ptrmasked(
-; CHECK-NEXT: [[TMP1:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr @constarray1, i64 [[MASK:%.*]])
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[IDX:%.*]]
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr [[TMP2]], align 2
-; CHECK-NEXT: ret i8 [[TMP3]]
+; CHECK-NEXT: ret i8 1
;
%1 = call ptr @llvm.ptrmask.p0.i64(ptr @constarray1, i64 %mask)
%2 = getelementptr inbounds i8, ptr %1, i64 %idx
@@ -102,13 +96,12 @@ define i32 @inbounds_gep_i32_load_i32_const_offset(i64 %idx){
ret i32 %3
}
-; TODO: this coould be folded into 65537(LE), 16777472(BE)
define i32 @gep_load_i32_align2_const_offset(i64 %idx){
-; CHECK-LABEL: @gep_load_i32_align2_const_offset(
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i16, ptr @constarray1, i64 -2
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr [3 x i16], ptr [[TMP1]], i64 [[IDX:%.*]]
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 2
-; CHECK-NEXT: ret i32 [[TMP3]]
+; LE-LABEL: @gep_load_i32_align2_const_offset(
+; LE-NEXT: ret i32 65537
+;
+; BE-LABEL: @gep_load_i32_align2_const_offset(
+; BE-NEXT: ret i32 16777472
;
%1 = getelementptr i16, ptr @constarray1, i64 -2
%2 = getelementptr [3 x i16], ptr %1, i64 %idx
@@ -146,12 +139,12 @@ define i32 @inbounds_gep_i32_load_i32_const_ptr_array(i64 %idx){
ret i32 %3
}
-; TODO: this coould be folded into 65537(LE), 16777472(BE)
define i32 @inbounds_gep_i32_load_i32_align4_packedstruct(i64 %idx){
-; CHECK-LABEL: @inbounds_gep_i32_load_i32_align4_packedstruct(
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr @constpackedstruct, i64 [[IDX:%.*]]
-; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
-; CHECK-NEXT: ret i32 [[TMP2]]
+; LE-LABEL: @inbounds_gep_i32_load_i32_align4_packedstruct(
+; LE-NEXT: ret i32 65537
+;
+; BE-LABEL: @inbounds_gep_i32_load_i32_align4_packedstruct(
+; BE-NEXT: ret i32 16777472
;
%1 = getelementptr inbounds i32, ptr @constpackedstruct, i64 %idx
%2 = load i32, ptr %1, align 4
@@ -172,11 +165,14 @@ define i32 @inbounds_gep_i8_load_i32_align1_packedstruct(i64 %idx){
; TODO: this could be folded into 65537(LE), 16777472(BE)
define i32 @inbounds_gep_i32_load_i32_align4_struct_with_const_offset(i64 %idx){
-; CHECK-LABEL: @inbounds_gep_i32_load_i32_align4_struct_with_const_offset(
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr @conststruct, i64 1
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[IDX:%.*]]
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
-; CHECK-NEXT: ret i32 [[TMP3]]
+; LE-LABEL: @inbounds_gep_i32_load_i32_align4_struct_with_const_offset(
+; LE-NEXT: ret i32 65537
+;
+; BE-LABEL: @inbounds_gep_i32_load_i32_align4_struct_with_const_offset(
+; BE-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr @conststruct, i64 1
+; BE-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[IDX:%.*]]
+; BE-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
+; BE-NEXT: ret i32 [[TMP3]]
;
%1 = getelementptr inbounds i16, ptr @conststruct, i64 1
%2 = getelementptr inbounds i32, ptr %1, i64 %idx
@@ -184,6 +180,3 @@ define i32 @inbounds_gep_i32_load_i32_align4_struct_with_const_offset(i64 %idx){
ret i32 %3
}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; BE: {{.*}}
-; LE: {{.*}}