[llvm] 19cdd19 - [AArch64][GlobalISel] Add heuristics for localizing G_CONSTANT.
Amara Emerson via llvm-commits
llvm-commits at lists.llvm.org
Wed Jul 27 10:51:23 PDT 2022
Author: Amara Emerson
Date: 2022-07-27T10:51:16-07:00
New Revision: 19cdd1908b173a0a4bdf29a18c689729fc3497ac
URL: https://github.com/llvm/llvm-project/commit/19cdd1908b173a0a4bdf29a18c689729fc3497ac
DIFF: https://github.com/llvm/llvm-project/commit/19cdd1908b173a0a4bdf29a18c689729fc3497ac.diff
LOG: [AArch64][GlobalISel] Add heuristics for localizing G_CONSTANT.
This adds heuristics similar to those used for G_GLOBAL_VALUE, querying the
code-size cost of materializing a specific constant. Doing so prevents us from
sinking constants that require multiple instructions to generate into their
use blocks.
Code size savings on CTMark -Os:
Program size.__text
before after diff
ClamAV/clamscan 381940.00 382052.00 0.0%
lencod/lencod 428408.00 428428.00 0.0%
SPASS/SPASS 411868.00 411876.00 0.0%
kimwitu++/kc 449944.00 449944.00 0.0%
Bullet/bullet 463588.00 463556.00 -0.0%
sqlite3/sqlite3 284696.00 284668.00 -0.0%
consumer-typeset/consumer-typeset 414492.00 414424.00 -0.0%
7zip/7zip-benchmark 595244.00 594972.00 -0.0%
mafft/pairlocalalign 247512.00 247368.00 -0.1%
tramp3d-v4/tramp3d-v4 372884.00 372044.00 -0.2%
Geomean difference -0.0%
Differential Revision: https://reviews.llvm.org/D130554
Added:
Modified:
llvm/include/llvm/CodeGen/MachineRegisterInfo.h
llvm/lib/CodeGen/MachineRegisterInfo.cpp
llvm/lib/CodeGen/TargetLoweringBase.cpp
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/test/CodeGen/AArch64/GlobalISel/localizer-arm64-tti.ll
Removed:
################################################################################
diff --git a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
index b2c5f12106af0..a51f1c753cd02 100644
--- a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
@@ -584,6 +584,11 @@ class MachineRegisterInfo {
/// multiple uses.
bool hasOneNonDBGUser(Register RegNo) const;
+
+ /// hasAtMostUserInstrs - Return true if the given register has at most
+ /// \p MaxUsers non-debug user instructions.
+ bool hasAtMostUserInstrs(Register Reg, unsigned MaxUsers) const;
+
/// replaceRegWith - Replace all instances of FromReg with ToReg in the
/// machine function. This is like llvm-level X->replaceAllUsesWith(Y),
/// except that it also changes any definitions of the register as well.
diff --git a/llvm/lib/CodeGen/MachineRegisterInfo.cpp b/llvm/lib/CodeGen/MachineRegisterInfo.cpp
index 511bb80052c29..dc109cd3aecab 100644
--- a/llvm/lib/CodeGen/MachineRegisterInfo.cpp
+++ b/llvm/lib/CodeGen/MachineRegisterInfo.cpp
@@ -420,6 +420,16 @@ bool MachineRegisterInfo::hasOneNonDBGUser(Register RegNo) const {
return hasSingleElement(use_nodbg_instructions(RegNo));
}
+bool MachineRegisterInfo::hasAtMostUserInstrs(Register Reg,
+ unsigned MaxUsers) const {
+ unsigned NumUsers = 0;
+ auto UI = use_instr_nodbg_begin(Reg), UE = use_instr_nodbg_end();
+ for (; UI != UE && NumUsers < MaxUsers; ++UI)
+ NumUsers++;
+ // If we haven't reached the end yet then there are more than MaxUsers users.
+ return UI == UE;
+}
+
/// clearKillFlags - Iterate over all the uses of the given register and
/// clear the kill flag from the MachineOperand. This function is used by
/// optimization passes which extend register lifetimes and need only
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index a342a4dd1e258..517e835c385e4 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -2335,18 +2335,6 @@ bool TargetLoweringBase::shouldLocalize(const MachineInstr &MI,
llvm_unreachable("Unexpected remat cost");
};
- // Helper to walk through uses and terminate if we've reached a limit. Saves
- // us spending time traversing uses if all we want to know is if it's >= min.
- auto isUsesAtMost = [&](unsigned Reg, unsigned MaxUses) {
- unsigned NumUses = 0;
- auto UI = MRI.use_instr_nodbg_begin(Reg), UE = MRI.use_instr_nodbg_end();
- for (; UI != UE && NumUses < MaxUses; ++UI) {
- NumUses++;
- }
- // If we haven't reached the end yet then there are more than MaxUses users.
- return UI == UE;
- };
-
switch (MI.getOpcode()) {
default:
return false;
@@ -2363,8 +2351,7 @@ bool TargetLoweringBase::shouldLocalize(const MachineInstr &MI,
unsigned MaxUses = maxUses(RematCost);
if (MaxUses == UINT_MAX)
return true; // Remats are "free" so always localize.
- bool B = isUsesAtMost(Reg, MaxUses);
- return B;
+ return MRI.hasAtMostUserInstrs(Reg, MaxUses);
}
}
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 72f0fc94940ca..302556908513e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -31,6 +31,7 @@
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/ObjCARCUtil.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/CallingConvLower.h"
@@ -75,6 +76,7 @@
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
@@ -20789,6 +20791,21 @@ bool AArch64TargetLowering::needsFixedCatchObjects() const {
bool AArch64TargetLowering::shouldLocalize(
const MachineInstr &MI, const TargetTransformInfo *TTI) const {
+ auto &MF = *MI.getMF();
+ auto &MRI = MF.getRegInfo();
+ auto maxUses = [](unsigned RematCost) {
+ // A cost of 1 means remats are basically free.
+ if (RematCost == 1)
+ return UINT_MAX;
+ if (RematCost == 2)
+ return 2U;
+
+ // Remat is too expensive, only sink if there's one user.
+ if (RematCost > 2)
+ return 1U;
+ llvm_unreachable("Unexpected remat cost");
+ };
+
switch (MI.getOpcode()) {
case TargetOpcode::G_GLOBAL_VALUE: {
// On Darwin, TLS global vars get selected into function calls, which
@@ -20799,6 +20816,18 @@ bool AArch64TargetLowering::shouldLocalize(
return false;
break;
}
+ case TargetOpcode::G_CONSTANT: {
+ auto *CI = MI.getOperand(1).getCImm();
+ APInt Imm = CI->getValue();
+ InstructionCost Cost = TTI->getIntImmCost(
+ Imm, CI->getType(), TargetTransformInfo::TCK_CodeSize);
+ assert(Cost.isValid() && "Expected a valid imm cost");
+
+ unsigned RematCost = *Cost.getValue();
+ Register Reg = MI.getOperand(0).getReg();
+ unsigned MaxUses = maxUses(RematCost);
+ return MRI.hasAtMostUserInstrs(Reg, MaxUses);
+ }
// If we legalized G_GLOBAL_VALUE into ADRP + G_ADD_LOW, mark both as being
// localizable.
case AArch64::ADRP:
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/localizer-arm64-tti.ll b/llvm/test/CodeGen/AArch64/GlobalISel/localizer-arm64-tti.ll
index 1ecc106265959..a4638d87a08c8 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/localizer-arm64-tti.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/localizer-arm64-tti.ll
@@ -114,3 +114,119 @@ if.end:
ret i32 0
}
+define i32 @imm_cost_too_large_cost_of_2() {
+ ; CHECK-LABEL: name: imm_cost_too_large_cost_of_2
+ ; CHECK: bb.1.entry:
+ ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[C:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 -2228259
+ ; CHECK-NEXT: [[GV:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var2
+ ; CHECK-NEXT: [[GV1:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var3
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[GV2:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var1
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:gpr(s32) = G_LOAD [[GV2]](p0) :: (dereferenceable load (s32) from @var1)
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]]:gpr(s32) = G_ICMP intpred(ne), [[LOAD]](s32), [[C2]]
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:gpr(s32) = G_AND [[ICMP]], [[C2]]
+ ; CHECK-NEXT: G_BRCOND [[AND]](s32), %bb.4
+ ; CHECK-NEXT: G_BR %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2.if.then:
+ ; CHECK-NEXT: successors: %bb.3(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[GV3:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var2
+ ; CHECK-NEXT: G_STORE [[C]](s32), [[GV3]](p0) :: (store (s32) into @var2)
+ ; CHECK-NEXT: G_BR %bb.3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.3.if.then2:
+ ; CHECK-NEXT: successors: %bb.4(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[GV4:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var1
+ ; CHECK-NEXT: G_STORE [[C]](s32), [[GV4]](p0) :: (store (s32) into @var1)
+ ; CHECK-NEXT: G_BR %bb.4
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.4.if.end:
+ ; CHECK-NEXT: [[GV5:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var3
+ ; CHECK-NEXT: G_STORE [[C]](s32), [[GV5]](p0) :: (store (s32) into @var3)
+ ; CHECK-NEXT: [[C3:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: $w0 = COPY [[C3]](s32)
+ ; CHECK-NEXT: RET_ReallyLR implicit $w0
+entry:
+ %0 = load i32, i32* @var1, align 4
+ %cst1 = bitcast i32 -2228259 to i32
+ %cmp = icmp eq i32 %0, 1
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+ store i32 %cst1, i32* @var2
+ br label %if.then2
+
+if.then2:
+ store i32 %cst1, i32* @var1
+ br label %if.end
+
+if.end:
+ store i32 %cst1, i32* @var3
+ ret i32 0
+}
+
+define i64 @imm_cost_too_large_cost_of_4() {
+ ; CHECK-LABEL: name: imm_cost_too_large_cost_of_4
+ ; CHECK: bb.1.entry:
+ ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[C:%[0-9]+]]:gpr(s64) = G_CONSTANT i64 -2228259
+ ; CHECK-NEXT: [[GV:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var2_64
+ ; CHECK-NEXT: [[GV1:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var3_64
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:gpr(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[GV2:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var1_64
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:gpr(s64) = G_LOAD [[GV2]](p0) :: (dereferenceable load (s64) from @var1_64, align 4)
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:gpr(s64) = G_CONSTANT i64 1
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]]:gpr(s32) = G_ICMP intpred(ne), [[LOAD]](s64), [[C2]]
+ ; CHECK-NEXT: [[C3:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:gpr(s32) = G_AND [[ICMP]], [[C3]]
+ ; CHECK-NEXT: G_BRCOND [[AND]](s32), %bb.4
+ ; CHECK-NEXT: G_BR %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2.if.then:
+ ; CHECK-NEXT: successors: %bb.3(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[GV3:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var2_64
+ ; CHECK-NEXT: G_STORE [[C]](s64), [[GV3]](p0) :: (store (s64) into @var2_64)
+ ; CHECK-NEXT: G_BR %bb.3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.3.if.then2:
+ ; CHECK-NEXT: successors: %bb.4(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[GV4:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var1_64
+ ; CHECK-NEXT: G_STORE [[C]](s64), [[GV4]](p0) :: (store (s64) into @var1_64)
+ ; CHECK-NEXT: G_BR %bb.4
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.4.if.end:
+ ; CHECK-NEXT: [[GV5:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var3_64
+ ; CHECK-NEXT: G_STORE [[C]](s64), [[GV5]](p0) :: (store (s64) into @var3_64)
+ ; CHECK-NEXT: [[C4:%[0-9]+]]:gpr(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: $x0 = COPY [[C4]](s64)
+ ; CHECK-NEXT: RET_ReallyLR implicit $x0
+entry:
+ %0 = load i64, i64* @var1_64, align 4
+ %cst1 = bitcast i64 -2228259 to i64
+ %cmp = icmp eq i64 %0, 1
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+ store i64 %cst1, i64* @var2_64
+ br label %if.then2
+
+if.then2:
+ store i64 %cst1, i64* @var1_64
+ br label %if.end
+
+if.end:
+ store i64 %cst1, i64* @var3_64
+ ret i64 0
+}
+
+@var1_64 = common global i64 0, align 4
+@var2_64 = common global i64 0, align 4
+@var3_64 = common global i64 0, align 4
More information about the llvm-commits
mailing list