[llvm] 19cdd19 - [AArch64][GlobalISel] Add heuristics for localizing G_CONSTANT.

Amara Emerson via llvm-commits llvm-commits at lists.llvm.org
Wed Jul 27 10:51:23 PDT 2022


Author: Amara Emerson
Date: 2022-07-27T10:51:16-07:00
New Revision: 19cdd1908b173a0a4bdf29a18c689729fc3497ac

URL: https://github.com/llvm/llvm-project/commit/19cdd1908b173a0a4bdf29a18c689729fc3497ac
DIFF: https://github.com/llvm/llvm-project/commit/19cdd1908b173a0a4bdf29a18c689729fc3497ac.diff

LOG: [AArch64][GlobalISel] Add heuristics for localizing G_CONSTANT.

This adds similar heuristics to G_GLOBAL_VALUE, querying the cost of
materializing a specific constant in code size. Doing so prevents us from
sinking constants which require multiple instructions to generate into
use blocks.

Code size savings on CTMark -Os:
Program                                       size.__text
                                              before         after           diff
ClamAV/clamscan                               381940.00      382052.00       0.0%
lencod/lencod                                 428408.00      428428.00       0.0%
SPASS/SPASS                                   411868.00      411876.00       0.0%
kimwitu++/kc                                  449944.00      449944.00       0.0%
Bullet/bullet                                 463588.00      463556.00      -0.0%
sqlite3/sqlite3                               284696.00      284668.00      -0.0%
consumer-typeset/consumer-typeset             414492.00      414424.00      -0.0%
7zip/7zip-benchmark                           595244.00      594972.00      -0.0%
mafft/pairlocalalign                          247512.00      247368.00      -0.1%
tramp3d-v4/tramp3d-v4                         372884.00      372044.00      -0.2%
                           Geomean difference                               -0.0%

Differential Revision: https://reviews.llvm.org/D130554

Added: 
    

Modified: 
    llvm/include/llvm/CodeGen/MachineRegisterInfo.h
    llvm/lib/CodeGen/MachineRegisterInfo.cpp
    llvm/lib/CodeGen/TargetLoweringBase.cpp
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/test/CodeGen/AArch64/GlobalISel/localizer-arm64-tti.ll

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
index b2c5f12106af0..a51f1c753cd02 100644
--- a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
@@ -584,6 +584,11 @@ class MachineRegisterInfo {
   /// multiple uses.
   bool hasOneNonDBGUser(Register RegNo) const;
 
+
+  /// hasAtMostUses - Return true if the given register has at most \p MaxUsers
+  /// non-debug user instructions.
+  bool hasAtMostUserInstrs(Register Reg, unsigned MaxUsers) const;
+
   /// replaceRegWith - Replace all instances of FromReg with ToReg in the
   /// machine function.  This is like llvm-level X->replaceAllUsesWith(Y),
   /// except that it also changes any definitions of the register as well.

diff  --git a/llvm/lib/CodeGen/MachineRegisterInfo.cpp b/llvm/lib/CodeGen/MachineRegisterInfo.cpp
index 511bb80052c29..dc109cd3aecab 100644
--- a/llvm/lib/CodeGen/MachineRegisterInfo.cpp
+++ b/llvm/lib/CodeGen/MachineRegisterInfo.cpp
@@ -420,6 +420,16 @@ bool MachineRegisterInfo::hasOneNonDBGUser(Register RegNo) const {
   return hasSingleElement(use_nodbg_instructions(RegNo));
 }
 
+bool MachineRegisterInfo::hasAtMostUserInstrs(Register Reg,
+                                              unsigned MaxUsers) const {
+  unsigned NumUsers = 0;
+  auto UI = use_instr_nodbg_begin(Reg), UE = use_instr_nodbg_end();
+  for (; UI != UE && NumUsers < MaxUsers; ++UI)
+    NumUsers++;
+  // If we haven't reached the end yet then there are more than MaxUses users.
+  return UI == UE;
+}
+
 /// clearKillFlags - Iterate over all the uses of the given register and
 /// clear the kill flag from the MachineOperand. This function is used by
 /// optimization passes which extend register lifetimes and need only

diff  --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index a342a4dd1e258..517e835c385e4 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -2335,18 +2335,6 @@ bool TargetLoweringBase::shouldLocalize(const MachineInstr &MI,
     llvm_unreachable("Unexpected remat cost");
   };
 
-  // Helper to walk through uses and terminate if we've reached a limit. Saves
-  // us spending time traversing uses if all we want to know is if it's >= min.
-  auto isUsesAtMost = [&](unsigned Reg, unsigned MaxUses) {
-    unsigned NumUses = 0;
-    auto UI = MRI.use_instr_nodbg_begin(Reg), UE = MRI.use_instr_nodbg_end();
-    for (; UI != UE && NumUses < MaxUses; ++UI) {
-      NumUses++;
-    }
-    // If we haven't reached the end yet then there are more than MaxUses users.
-    return UI == UE;
-  };
-
   switch (MI.getOpcode()) {
   default:
     return false;
@@ -2363,8 +2351,7 @@ bool TargetLoweringBase::shouldLocalize(const MachineInstr &MI,
     unsigned MaxUses = maxUses(RematCost);
     if (MaxUses == UINT_MAX)
       return true; // Remats are "free" so always localize.
-    bool B = isUsesAtMost(Reg, MaxUses);
-    return B;
+    return MRI.hasAtMostUserInstrs(Reg, MaxUses);
   }
   }
 }

diff  --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 72f0fc94940ca..302556908513e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -31,6 +31,7 @@
 #include "llvm/ADT/Twine.h"
 #include "llvm/Analysis/MemoryLocation.h"
 #include "llvm/Analysis/ObjCARCUtil.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/VectorUtils.h"
 #include "llvm/CodeGen/Analysis.h"
 #include "llvm/CodeGen/CallingConvLower.h"
@@ -75,6 +76,7 @@
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/InstructionCost.h"
 #include "llvm/Support/KnownBits.h"
 #include "llvm/Support/MachineValueType.h"
 #include "llvm/Support/MathExtras.h"
@@ -20789,6 +20791,21 @@ bool AArch64TargetLowering::needsFixedCatchObjects() const {
 
 bool AArch64TargetLowering::shouldLocalize(
     const MachineInstr &MI, const TargetTransformInfo *TTI) const {
+  auto &MF = *MI.getMF();
+  auto &MRI = MF.getRegInfo();
+  auto maxUses = [](unsigned RematCost) {
+    // A cost of 1 means remats are basically free.
+    if (RematCost == 1)
+      return UINT_MAX;
+    if (RematCost == 2)
+      return 2U;
+
+    // Remat is too expensive, only sink if there's one user.
+    if (RematCost > 2)
+      return 1U;
+    llvm_unreachable("Unexpected remat cost");
+  };
+
   switch (MI.getOpcode()) {
   case TargetOpcode::G_GLOBAL_VALUE: {
     // On Darwin, TLS global vars get selected into function calls, which
@@ -20799,6 +20816,18 @@ bool AArch64TargetLowering::shouldLocalize(
       return false;
     break;
   }
+  case TargetOpcode::G_CONSTANT: {
+    auto *CI = MI.getOperand(1).getCImm();
+    APInt Imm = CI->getValue();
+    InstructionCost Cost = TTI->getIntImmCost(
+        Imm, CI->getType(), TargetTransformInfo::TCK_CodeSize);
+    assert(Cost.isValid() && "Expected a valid imm cost");
+
+    unsigned RematCost = *Cost.getValue();
+    Register Reg = MI.getOperand(0).getReg();
+    unsigned MaxUses = maxUses(RematCost);
+    return MRI.hasAtMostUserInstrs(Reg, MaxUses);
+  }
   // If we legalized G_GLOBAL_VALUE into ADRP + G_ADD_LOW, mark both as being
   // localizable.
   case AArch64::ADRP:

diff  --git a/llvm/test/CodeGen/AArch64/GlobalISel/localizer-arm64-tti.ll b/llvm/test/CodeGen/AArch64/GlobalISel/localizer-arm64-tti.ll
index 1ecc106265959..a4638d87a08c8 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/localizer-arm64-tti.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/localizer-arm64-tti.ll
@@ -114,3 +114,119 @@ if.end:
   ret i32 0
 }
 
+define i32 @imm_cost_too_large_cost_of_2() {
+  ; CHECK-LABEL: name: imm_cost_too_large_cost_of_2
+  ; CHECK: bb.1.entry:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.4(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[C:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 -2228259
+  ; CHECK-NEXT:   [[GV:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var2
+  ; CHECK-NEXT:   [[GV1:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var3
+  ; CHECK-NEXT:   [[C1:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 0
+  ; CHECK-NEXT:   [[GV2:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var1
+  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:gpr(s32) = G_LOAD [[GV2]](p0) :: (dereferenceable load (s32) from @var1)
+  ; CHECK-NEXT:   [[C2:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 1
+  ; CHECK-NEXT:   [[ICMP:%[0-9]+]]:gpr(s32) = G_ICMP intpred(ne), [[LOAD]](s32), [[C2]]
+  ; CHECK-NEXT:   [[AND:%[0-9]+]]:gpr(s32) = G_AND [[ICMP]], [[C2]]
+  ; CHECK-NEXT:   G_BRCOND [[AND]](s32), %bb.4
+  ; CHECK-NEXT:   G_BR %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2.if.then:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[GV3:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var2
+  ; CHECK-NEXT:   G_STORE [[C]](s32), [[GV3]](p0) :: (store (s32) into @var2)
+  ; CHECK-NEXT:   G_BR %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3.if.then2:
+  ; CHECK-NEXT:   successors: %bb.4(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[GV4:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var1
+  ; CHECK-NEXT:   G_STORE [[C]](s32), [[GV4]](p0) :: (store (s32) into @var1)
+  ; CHECK-NEXT:   G_BR %bb.4
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4.if.end:
+  ; CHECK-NEXT:   [[GV5:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var3
+  ; CHECK-NEXT:   G_STORE [[C]](s32), [[GV5]](p0) :: (store (s32) into @var3)
+  ; CHECK-NEXT:   [[C3:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 0
+  ; CHECK-NEXT:   $w0 = COPY [[C3]](s32)
+  ; CHECK-NEXT:   RET_ReallyLR implicit $w0
+entry:
+  %0 = load i32, i32* @var1, align 4
+  %cst1 = bitcast i32 -2228259 to i32
+  %cmp = icmp eq i32 %0, 1
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  store i32 %cst1, i32* @var2
+  br label %if.then2
+
+if.then2:
+  store i32 %cst1, i32* @var1
+  br label %if.end
+
+if.end:
+  store i32 %cst1, i32* @var3
+  ret i32 0
+}
+
+define i64 @imm_cost_too_large_cost_of_4() {
+  ; CHECK-LABEL: name: imm_cost_too_large_cost_of_4
+  ; CHECK: bb.1.entry:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.4(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[C:%[0-9]+]]:gpr(s64) = G_CONSTANT i64 -2228259
+  ; CHECK-NEXT:   [[GV:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var2_64
+  ; CHECK-NEXT:   [[GV1:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var3_64
+  ; CHECK-NEXT:   [[C1:%[0-9]+]]:gpr(s64) = G_CONSTANT i64 0
+  ; CHECK-NEXT:   [[GV2:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var1_64
+  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:gpr(s64) = G_LOAD [[GV2]](p0) :: (dereferenceable load (s64) from @var1_64, align 4)
+  ; CHECK-NEXT:   [[C2:%[0-9]+]]:gpr(s64) = G_CONSTANT i64 1
+  ; CHECK-NEXT:   [[ICMP:%[0-9]+]]:gpr(s32) = G_ICMP intpred(ne), [[LOAD]](s64), [[C2]]
+  ; CHECK-NEXT:   [[C3:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 1
+  ; CHECK-NEXT:   [[AND:%[0-9]+]]:gpr(s32) = G_AND [[ICMP]], [[C3]]
+  ; CHECK-NEXT:   G_BRCOND [[AND]](s32), %bb.4
+  ; CHECK-NEXT:   G_BR %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2.if.then:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[GV3:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var2_64
+  ; CHECK-NEXT:   G_STORE [[C]](s64), [[GV3]](p0) :: (store (s64) into @var2_64)
+  ; CHECK-NEXT:   G_BR %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3.if.then2:
+  ; CHECK-NEXT:   successors: %bb.4(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[GV4:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var1_64
+  ; CHECK-NEXT:   G_STORE [[C]](s64), [[GV4]](p0) :: (store (s64) into @var1_64)
+  ; CHECK-NEXT:   G_BR %bb.4
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4.if.end:
+  ; CHECK-NEXT:   [[GV5:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var3_64
+  ; CHECK-NEXT:   G_STORE [[C]](s64), [[GV5]](p0) :: (store (s64) into @var3_64)
+  ; CHECK-NEXT:   [[C4:%[0-9]+]]:gpr(s64) = G_CONSTANT i64 0
+  ; CHECK-NEXT:   $x0 = COPY [[C4]](s64)
+  ; CHECK-NEXT:   RET_ReallyLR implicit $x0
+entry:
+  %0 = load i64, i64* @var1_64, align 4
+  %cst1 = bitcast i64 -2228259 to i64
+  %cmp = icmp eq i64 %0, 1
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  store i64 %cst1, i64* @var2_64
+  br label %if.then2
+
+if.then2:
+  store i64 %cst1, i64* @var1_64
+  br label %if.end
+
+if.end:
+  store i64 %cst1, i64* @var3_64
+  ret i64 0
+}
+
+ at var1_64 = common global i64 0, align 4
+ at var2_64 = common global i64 0, align 4
+ at var3_64 = common global i64 0, align 4


        


More information about the llvm-commits mailing list