[llvm] [AArch64][GlobalISel] Avoid running the shl(zext(a), C) -> zext(shl(a, C)) combine. (PR #67045)
Amara Emerson via llvm-commits
llvm-commits at lists.llvm.org
Thu Sep 21 10:28:01 PDT 2023
aemerson (https://github.com/aemerson) created https://github.com/llvm/llvm-project/pull/67045
This combine moves shifts inward, which narrows them. On AArch64, however, doing so
prevents us from using our extended-register operands, since those forms require the
shift to apply to the extended value; once the shift moves inside the extend, the
pattern no longer matches.
Gives some code size savings on -Os CTMark:
Program                            size.__text
                                   before       after        diff
SPASS/SPASS                        410616.00    410616.00     0.0%
kimwitu++/kc                       453636.00    453636.00     0.0%
tramp3d-v4/tramp3d-v4              393808.00    393808.00     0.0%
mafft/pairlocalalign               244284.00    244280.00    -0.0%
sqlite3/sqlite3                    287832.00    287800.00    -0.0%
Bullet/bullet                      461144.00    461092.00    -0.0%
consumer-typeset/consumer-typeset  412220.00    412164.00    -0.0%
7zip/7zip-benchmark                593512.00    593364.00    -0.0%
ClamAV/clamscan                    381964.00    381836.00    -0.0%
lencod/lencod                      428060.00    427836.00    -0.1%
Geomean difference                                           -0.0%
From 53eb7498c05c4487e84554fbf212b607f04f228b Mon Sep 17 00:00:00 2001
From: Amara Emerson <amara at apple.com>
Date: Thu, 21 Sep 2023 09:31:39 -0700
Subject: [PATCH] [AArch64][GlobalISel] Avoid running the shl(zext(a), C) ->
zext(shl(a, C)) combine.
This combine moves shifts inward, which narrows them. On AArch64, however, doing so
prevents us from using our extended-register operands, since those forms require the
shift to apply to the extended value; once the shift moves inside the extend, the
pattern no longer matches.
Gives some code size savings on -Os CTMark:
Program                            size.__text
                                   before       after        diff
SPASS/SPASS                        410616.00    410616.00     0.0%
kimwitu++/kc                       453636.00    453636.00     0.0%
tramp3d-v4/tramp3d-v4              393808.00    393808.00     0.0%
mafft/pairlocalalign               244284.00    244280.00    -0.0%
sqlite3/sqlite3                    287832.00    287800.00    -0.0%
Bullet/bullet                      461144.00    461092.00    -0.0%
consumer-typeset/consumer-typeset  412220.00    412164.00    -0.0%
7zip/7zip-benchmark                593512.00    593364.00    -0.0%
ClamAV/clamscan                    381964.00    381836.00    -0.0%
lencod/lencod                      428060.00    427836.00    -0.1%
Geomean difference                                           -0.0%
---
 llvm/include/llvm/CodeGen/TargetLowering.h     |  6 ++++++
 .../lib/CodeGen/GlobalISel/CombinerHelper.cpp  |  2 ++
 llvm/lib/Target/AArch64/AArch64ISelLowering.h  |  4 ++++
 .../GlobalISel/no-reduce-shl-of-ext.ll         | 19 +++++++++++++++++++
 4 files changed, 31 insertions(+)
create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/no-reduce-shl-of-ext.ll
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index bd802bd4b173a0b..477b31cb776e68a 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -4129,6 +4129,12 @@ class TargetLowering : public TargetLoweringBase {
return true;
}
+ /// GlobalISel - return true if it's profitable to perform the combine:
+ /// shl ([sza]ext x), y => zext (shl x, y)
+ virtual bool isDesirableToPullExtFromShl(const MachineInstr &MI) const {
+ return true;
+ }
+
// Return AndOrSETCCFoldKind::{AddAnd, ABS} if its desirable to try and
// optimize LogicOp(SETCC0, SETCC1). An example (what is implemented as of
// writing this) is:
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 2ce68950424095e..f79944e824575a1 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -1719,6 +1719,8 @@ void CombinerHelper::applyCombineMulToShl(MachineInstr &MI,
 bool CombinerHelper::matchCombineShlOfExtend(MachineInstr &MI,
                                              RegisterImmPair &MatchData) {
   assert(MI.getOpcode() == TargetOpcode::G_SHL && KB);
+  if (!getTargetLowering().isDesirableToPullExtFromShl(MI))
+    return false;
 
   Register LHS = MI.getOperand(1).getReg();
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index e015f68dabc6977..bdde4b5e8e00f87 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -690,6 +690,10 @@ class AArch64TargetLowering : public TargetLowering {
   bool isDesirableToCommuteWithShift(const SDNode *N,
                                      CombineLevel Level) const override;
 
+  bool isDesirableToPullExtFromShl(const MachineInstr &MI) const override {
+    return false;
+  }
+
   /// Returns false if N is a bit extraction pattern of (X >> C) & Mask.
   bool isDesirableToCommuteXorWithShift(const SDNode *N) const override;
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/no-reduce-shl-of-ext.ll b/llvm/test/CodeGen/AArch64/GlobalISel/no-reduce-shl-of-ext.ll
new file mode 100644
index 000000000000000..ab009cb7cc0e305
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/no-reduce-shl-of-ext.ll
@@ -0,0 +1,19 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc %s -verify-machineinstrs -mtriple aarch64-apple-darwin -global-isel -o - | FileCheck %s
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+%struct.mszip_stream = type { i32, i32, i8, i32, ptr, i32, i32, i32, i32, ptr, ptr, ptr, ptr, ptr, i32, i32, i32, [288 x i8], [32 x i8], [1152 x i16], [128 x i16], [32768 x i8], ptr, ptr }
+
+define i16 @test(i32 %bit_buffer.6.lcssa, ptr %zip, ptr %.out) {
+; CHECK-LABEL: test:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    and w8, w0, #0x1ff
+; CHECK-NEXT:    add x8, x1, w8, uxtw #1
+; CHECK-NEXT:    ldrh w0, [x8, #412]
+; CHECK-NEXT:    ret
+  %and274 = and i32 %bit_buffer.6.lcssa, 511
+  %idxprom275 = zext i32 %and274 to i64
+  %arrayidx276 = getelementptr inbounds %struct.mszip_stream, ptr %zip, i64 0, i32 19, i64 %idxprom275
+  %ld = load i16, ptr %arrayidx276, align 2
+  ret i16 %ld
+}
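For reference, the getelementptr in the test indexes field 19 of %struct.mszip_stream, the [1152 x i16] array, which sits 412 bytes into the struct (matching the ldrh immediate in the CHECK lines). A rough sketch of how that address computation decomposes (illustrative only, not actual compiler output):

  %idx = zext i32 %and274 to i64                ; index masked to 9 bits above
  %scaled = shl i64 %idx, 1                     ; i16 elements: scale by 2
  %field = getelementptr i8, ptr %zip, i64 412  ; start of the i16 array
  %addr = getelementptr i8, ptr %field, i64 %scaled

With the combine suppressed, the zext and shl stay adjacent and instruction selection folds them into the "uxtw #1" extended-register operand of the add, as the CHECK lines verify; previously the shift would have been narrowed to i32 and that fold lost.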