[llvm] [AArch64][GlobalISel] Avoid running the shl(zext(a), C) -> zext(shl(a, C)) combine. (PR #67045)
Amara Emerson via llvm-commits
llvm-commits at lists.llvm.org
Thu Sep 21 10:28:01 PDT 2023
aemerson (https://github.com/aemerson) created https://github.com/llvm/llvm-project/pull/67045
This combine moves shifts inward, which narrows them. On AArch64, however, doing so
prevents us from using our extended-register operands, since those forms require the
shift to apply to the extended value; once the shift moves inside the extend, the
pattern no longer matches.
Gives some code size savings on -Os CTMark:
Program                            size.__text
                                   before       after        diff
SPASS/SPASS                        410616.00    410616.00     0.0%
kimwitu++/kc                       453636.00    453636.00     0.0%
tramp3d-v4/tramp3d-v4              393808.00    393808.00     0.0%
mafft/pairlocalalign               244284.00    244280.00    -0.0%
sqlite3/sqlite3                    287832.00    287800.00    -0.0%
Bullet/bullet                      461144.00    461092.00    -0.0%
consumer-typeset/consumer-typeset  412220.00    412164.00    -0.0%
7zip/7zip-benchmark                593512.00    593364.00    -0.0%
ClamAV/clamscan                    381964.00    381836.00    -0.0%
lencod/lencod                      428060.00    427836.00    -0.1%
Geomean difference                                           -0.0%
From 53eb7498c05c4487e84554fbf212b607f04f228b Mon Sep 17 00:00:00 2001
From: Amara Emerson <amara at apple.com>
Date: Thu, 21 Sep 2023 09:31:39 -0700
Subject: [PATCH] [AArch64][GlobalISel] Avoid running the shl(zext(a), C) ->
zext(shl(a, C)) combine.
This combine moves shifts inward, which narrows them. On AArch64, however, doing so
prevents us from using our extended-register operands, since those forms require the
shift to apply to the extended value; once the shift moves inside the extend, the
pattern no longer matches.
Gives some code size savings on -Os CTMark:
Program                            size.__text
                                   before       after        diff
SPASS/SPASS                        410616.00    410616.00     0.0%
kimwitu++/kc                       453636.00    453636.00     0.0%
tramp3d-v4/tramp3d-v4              393808.00    393808.00     0.0%
mafft/pairlocalalign               244284.00    244280.00    -0.0%
sqlite3/sqlite3                    287832.00    287800.00    -0.0%
Bullet/bullet                      461144.00    461092.00    -0.0%
consumer-typeset/consumer-typeset  412220.00    412164.00    -0.0%
7zip/7zip-benchmark                593512.00    593364.00    -0.0%
ClamAV/clamscan                    381964.00    381836.00    -0.0%
lencod/lencod                      428060.00    427836.00    -0.1%
Geomean difference                                           -0.0%
---
 llvm/include/llvm/CodeGen/TargetLowering.h     |  6 ++++++
 .../lib/CodeGen/GlobalISel/CombinerHelper.cpp  |  2 ++
 llvm/lib/Target/AArch64/AArch64ISelLowering.h  |  4 ++++
 .../GlobalISel/no-reduce-shl-of-ext.ll         | 19 +++++++++++++++++++
 4 files changed, 31 insertions(+)
create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/no-reduce-shl-of-ext.ll
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index bd802bd4b173a0b..477b31cb776e68a 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -4129,6 +4129,12 @@ class TargetLowering : public TargetLoweringBase {
return true;
}
+ /// GlobalISel - return true if it's profitable to perform the combine:
+ /// shl ([sza]ext x), y => zext (shl x, y)
+ virtual bool isDesirableToPullExtFromShl(const MachineInstr &MI) const {
+ return true;
+ }
+
// Return AndOrSETCCFoldKind::{AddAnd, ABS} if its desirable to try and
// optimize LogicOp(SETCC0, SETCC1). An example (what is implemented as of
// writing this) is:
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 2ce68950424095e..f79944e824575a1 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -1719,6 +1719,8 @@ void CombinerHelper::applyCombineMulToShl(MachineInstr &MI,
 bool CombinerHelper::matchCombineShlOfExtend(MachineInstr &MI,
                                              RegisterImmPair &MatchData) {
   assert(MI.getOpcode() == TargetOpcode::G_SHL && KB);
+  if (!getTargetLowering().isDesirableToPullExtFromShl(MI))
+    return false;
 
   Register LHS = MI.getOperand(1).getReg();
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index e015f68dabc6977..bdde4b5e8e00f87 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -690,6 +690,10 @@ class AArch64TargetLowering : public TargetLowering {
   bool isDesirableToCommuteWithShift(const SDNode *N,
                                      CombineLevel Level) const override;
 
+  bool isDesirableToPullExtFromShl(const MachineInstr &MI) const override {
+    return false;
+  }
+
   /// Returns false if N is a bit extraction pattern of (X >> C) & Mask.
   bool isDesirableToCommuteXorWithShift(const SDNode *N) const override;
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/no-reduce-shl-of-ext.ll b/llvm/test/CodeGen/AArch64/GlobalISel/no-reduce-shl-of-ext.ll
new file mode 100644
index 000000000000000..ab009cb7cc0e305
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/no-reduce-shl-of-ext.ll
@@ -0,0 +1,19 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc %s -verify-machineinstrs -mtriple aarch64-apple-darwin -global-isel -o - | FileCheck %s
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+%struct.mszip_stream = type { i32, i32, i8, i32, ptr, i32, i32, i32, i32, ptr, ptr, ptr, ptr, ptr, i32, i32, i32, [288 x i8], [32 x i8], [1152 x i16], [128 x i16], [32768 x i8], ptr, ptr }
+
+define i16 @test(i32 %bit_buffer.6.lcssa, ptr %zip, ptr %.out) {
+; CHECK-LABEL: test:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    and w8, w0, #0x1ff
+; CHECK-NEXT:    add x8, x1, w8, uxtw #1
+; CHECK-NEXT:    ldrh w0, [x8, #412]
+; CHECK-NEXT:    ret
+  %and274 = and i32 %bit_buffer.6.lcssa, 511
+  %idxprom275 = zext i32 %and274 to i64
+  %arrayidx276 = getelementptr inbounds %struct.mszip_stream, ptr %zip, i64 0, i32 19, i64 %idxprom275
+  %ld = load i16, ptr %arrayidx276, align 2
+  ret i16 %ld
+}
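For reference, the getelementptr in the test indexes field 19 of %struct.mszip_stream, the [1152 x i16] array, which sits 412 bytes into the struct (matching the ldrh immediate in the CHECK lines). A rough sketch of how that address computation decomposes (illustrative only, not actual compiler output):

  %idx = zext i32 %and274 to i64                ; index masked to 9 bits above
  %scaled = shl i64 %idx, 1                     ; i16 elements: scale by 2
  %field = getelementptr i8, ptr %zip, i64 412  ; start of the i16 array
  %addr = getelementptr i8, ptr %field, i64 %scaled

With the combine suppressed, the zext and shl stay adjacent and instruction selection folds them into the "uxtw #1" extended-register operand of the add, as the CHECK lines verify; previously the shift would have been narrowed to i32 and that fold lost.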