[llvm] e1472db - [GlobalISel] Implement commuting shl (add/or x, c1), c2 -> add/or (shl x, c2), c1 << c2
Amara Emerson via llvm-commits
llvm-commits at lists.llvm.org
Mon May 8 22:38:11 PDT 2023
Author: Amara Emerson
Date: 2023-05-08T22:37:43-07:00
New Revision: e1472db58ef501264547ac8c97be2bf7c481ec04
URL: https://github.com/llvm/llvm-project/commit/e1472db58ef501264547ac8c97be2bf7c481ec04
DIFF: https://github.com/llvm/llvm-project/commit/e1472db58ef501264547ac8c97be2bf7c481ec04.diff
LOG: [GlobalISel] Implement commuting shl (add/or x, c1), c2 -> add/or (shl x, c2), c1 << c2
There's a target hook called in DAGCombiner that we stub here; I'll implement
the equivalent override for AArch64 in a subsequent patch, since it's used by a
different shift combine.
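For illustration only (not part of the patch), here is a minimal standalone C++
check of the identity the combine relies on; the concrete values mirror the
AMDGPU test update below, where add 1012 then shl 9 becomes shl 9 then add
0x7e800 (= 1012 << 9):

// Illustrative only: left shift distributes over add (modulo 2^32) and over or,
// which is the algebraic fact behind commuting the shift.
#include <cassert>
#include <cstdint>

int main() {
  uint32_t x = 0xDEADBEEF;
  uint32_t c1 = 1012, c2 = 9;
  assert(((x + c1) << c2) == ((x << c2) + (c1 << c2)));
  assert(((x | c1) << c2) == ((x << c2) | (c1 << c2)));
  assert((c1 << c2) == 0x7e800u); // the new immediate in add_shl_vgpr_const_inline_const
  return 0;
}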
This change by itself has minor code size improvements on arm64 -Os CTMark:
Program                             size.__text
                                    outputg181ppyy  output8av1cxfn   diff
consumer-typeset/consumer-typeset        410648.00       410648.00   0.0%
tramp3d-v4/tramp3d-v4                    364176.00       364176.00   0.0%
kimwitu++/kc                             449216.00       449212.00  -0.0%
7zip/7zip-benchmark                      576128.00       576120.00  -0.0%
sqlite3/sqlite3                          285108.00       285100.00  -0.0%
SPASS/SPASS                              411720.00       411688.00  -0.0%
ClamAV/clamscan                          379868.00       379764.00  -0.0%
Bullet/bullet                            452064.00       451928.00  -0.0%
mafft/pairlocalalign                     246184.00       246108.00  -0.0%
lencod/lencod                            428524.00       428152.00  -0.1%
Geomean difference                                                  -0.0%
Differential Revision: https://reviews.llvm.org/D150086
Added:
llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-commute-shift.mir
Modified:
llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
llvm/include/llvm/CodeGen/TargetLowering.h
llvm/include/llvm/Target/GlobalISel/Combine.td
llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
llvm/test/CodeGen/AMDGPU/GlobalISel/add_shl.ll
Removed:
################################################################################
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index f6c0764a6426c..ec3762094eacb 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -302,6 +302,8 @@ class CombinerHelper {
void applyShiftOfShiftedLogic(MachineInstr &MI,
ShiftOfShiftedLogic &MatchInfo);
+ bool matchCommuteShift(MachineInstr &MI, BuildFnTy &MatchInfo);
+
/// Transform a multiply by a power-of-2 value to a left shift.
bool matchCombineMulToShl(MachineInstr &MI, unsigned &ShiftVal);
void applyCombineMulToShl(MachineInstr &MI, unsigned &ShiftVal);
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 355091146d240..c458c820c8a18 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -4034,6 +4034,19 @@ class TargetLowering : public TargetLoweringBase {
return true;
}
+ /// GlobalISel - return true if it is profitable to move this shift by a
+ /// constant amount through its operand, adjusting any immediate operands as
+ /// necessary to preserve semantics. This transformation may not be desirable
+ /// if it disrupts a particularly auspicious target-specific tree (e.g.
+ /// bitfield extraction in AArch64). By default, it returns true.
+ ///
+ /// @param MI the shift instruction
+ /// @param IsAfterLegal true if running after legalization.
+ virtual bool isDesirableToCommuteWithShift(const MachineInstr &MI,
+ bool IsAfterLegal) const {
+ return true;
+ }
+
// Return AndOrSETCCFoldKind::{AddAnd, ABS} if its desirable to try and
// optimize LogicOp(SETCC0, SETCC1). An example (what is implemented as of
// writing this) is:
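As an aside on the new hook (not part of this commit): a target that wants to
keep the (shl (add/or x, c1), c2) tree intact for later pattern matching could
override it. A hypothetical sketch, assuming a target's usual TargetLowering
subclass (the class name below is made up; the real AArch64 override lands in a
follow-up patch):

// Hypothetical sketch only, not taken from this patch.
bool MyTargetLowering::isDesirableToCommuteWithShift(
    const MachineInstr &MI, bool IsAfterLegal) const {
  // Example policy: allow the commute only before legalization, so that
  // post-legalization selection patterns built around the original
  // shift-of-add/or shape are not disturbed.
  return !IsAfterLegal;
}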
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index c2054a689bd88..0c7048312e908 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -243,6 +243,14 @@ def reduce_shl_of_extend : GICombineRule<
[{ return Helper.matchCombineShlOfExtend(*${mi}, ${matchinfo}); }]),
(apply [{ Helper.applyCombineShlOfExtend(*${mi}, ${matchinfo}); }])>;
+// Combine (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
+// Combine (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
+def commute_shift : GICombineRule<
+ (defs root:$d, build_fn_matchinfo:$matchinfo),
+ (match (wip_match_opcode G_SHL):$d,
+ [{ return Helper.matchCommuteShift(*${d}, ${matchinfo}); }]),
+ (apply [{ Helper.applyBuildFn(*${d}, ${matchinfo}); }])>;
+
def narrow_binop_feeding_and : GICombineRule<
(defs root:$root, build_fn_matchinfo:$matchinfo),
(match (wip_match_opcode G_AND):$root,
@@ -1097,7 +1105,7 @@ def all_combines : GICombineGroup<[trivial_combines, insert_vec_elt_combines,
unmerge_zext_to_zext, merge_unmerge, trunc_ext_fold, trunc_shift,
const_combines, xor_of_and_with_same_reg, ptr_add_with_zero,
shift_immed_chain, shift_of_shifted_logic_chain, load_or_combine,
- div_rem_to_divrem, funnel_shift_combines,
+ div_rem_to_divrem, funnel_shift_combines, commute_shift,
form_bitfield_extract, constant_fold, fabs_fneg_fold,
intdiv_combines, mulh_combines, redundant_neg_operands,
and_or_disjoint_mask, fma_combines, fold_binop_into_select,
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 0ddec2599803c..c7636c1a6ea3a 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -1624,6 +1624,41 @@ void CombinerHelper::applyShiftOfShiftedLogic(MachineInstr &MI,
MI.eraseFromParent();
}
+bool CombinerHelper::matchCommuteShift(MachineInstr &MI, BuildFnTy &MatchInfo) {
+ assert(MI.getOpcode() == TargetOpcode::G_SHL && "Expected G_SHL");
+ // Combine (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
+ // Combine (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
+ auto &Shl = cast<GenericMachineInstr>(MI);
+ Register DstReg = Shl.getReg(0);
+ Register SrcReg = Shl.getReg(1);
+ Register ShiftReg = Shl.getReg(2);
+ Register X, C1;
+
+ if (!getTargetLowering().isDesirableToCommuteWithShift(MI, !isPreLegalize()))
+ return false;
+
+ if (!mi_match(SrcReg, MRI,
+ m_OneNonDBGUse(m_any_of(m_GAdd(m_Reg(X), m_Reg(C1)),
+ m_GOr(m_Reg(X), m_Reg(C1))))))
+ return false;
+
+ APInt C1Val, C2Val;
+ if (!mi_match(C1, MRI, m_ICstOrSplat(C1Val)) ||
+ !mi_match(ShiftReg, MRI, m_ICstOrSplat(C2Val)))
+ return false;
+
+ auto *SrcDef = MRI.getVRegDef(SrcReg);
+ assert((SrcDef->getOpcode() == TargetOpcode::G_ADD ||
+ SrcDef->getOpcode() == TargetOpcode::G_OR) && "Unexpected op");
+ LLT SrcTy = MRI.getType(SrcReg);
+ MatchInfo = [=](MachineIRBuilder &B) {
+ auto S1 = B.buildShl(SrcTy, X, ShiftReg);
+ auto S2 = B.buildShl(SrcTy, C1, ShiftReg);
+ B.buildInstr(SrcDef->getOpcode(), {DstReg}, {S1, S2});
+ };
+ return true;
+}
+
bool CombinerHelper::matchCombineMulToShl(MachineInstr &MI,
unsigned &ShiftVal) {
assert(MI.getOpcode() == TargetOpcode::G_MUL && "Expected a G_MUL");
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-commute-shift.mir b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-commute-shift.mir
new file mode 100644
index 0000000000000..d4800748374e0
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-commute-shift.mir
@@ -0,0 +1,128 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
+# RUN: llc -mtriple aarch64 -mattr=+fullfp16 -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
+---
+name: shl_add_k
+alignment: 4
+tracksRegLiveness: true
+body: |
+ bb.1:
+ liveins: $w1, $x0
+
+ ; CHECK-LABEL: name: shl_add_k
+ ; CHECK: liveins: $w1, $x0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+ ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY1]], [[C]](s32)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+ ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[SHL]], [[C1]]
+ ; CHECK-NEXT: G_STORE [[ADD]](s32), [[COPY]](p0) :: (store (s32))
+ ; CHECK-NEXT: RET_ReallyLR
+ %0:_(p0) = COPY $x0
+ %1:_(s32) = COPY $w1
+ %2:_(s32) = G_CONSTANT i32 1
+ %4:_(s32) = G_CONSTANT i32 2
+ %3:_(s32) = G_ADD %1, %2
+ %5:_(s32) = G_SHL %3, %4(s32)
+ G_STORE %5(s32), %0(p0) :: (store (s32))
+ RET_ReallyLR
+
+...
+---
+name: shl_or_k
+alignment: 4
+tracksRegLiveness: true
+body: |
+ bb.1:
+ liveins: $w1, $x0
+
+ ; CHECK-LABEL: name: shl_or_k
+ ; CHECK: liveins: $w1, $x0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+ ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY1]], [[C]](s32)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+ ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[C1]]
+ ; CHECK-NEXT: G_STORE [[OR]](s32), [[COPY]](p0) :: (store (s32))
+ ; CHECK-NEXT: RET_ReallyLR
+ %0:_(p0) = COPY $x0
+ %1:_(s32) = COPY $w1
+ %2:_(s32) = G_CONSTANT i32 1
+ %4:_(s32) = G_CONSTANT i32 2
+ %3:_(s32) = G_OR %1, %2
+ %5:_(s32) = G_SHL %3, %4(s32)
+ G_STORE %5(s32), %0(p0) :: (store (s32))
+ RET_ReallyLR
+
+...
+---
+name: shl_or_k_multiuse
+alignment: 4
+tracksRegLiveness: true
+body: |
+ bb.1:
+ liveins: $w1, $x0
+
+ ; CHECK-LABEL: name: shl_or_k_multiuse
+ ; CHECK: liveins: $w1, $x0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+ ; CHECK-NEXT: %ptr:_(p0) = COPY $x1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+ ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY1]], [[C]]
+ ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[OR]], [[C1]](s32)
+ ; CHECK-NEXT: G_STORE [[SHL]](s32), [[COPY]](p0) :: (store (s32))
+ ; CHECK-NEXT: G_STORE [[OR]](s32), %ptr(p0) :: (store (s32))
+ ; CHECK-NEXT: RET_ReallyLR
+ %0:_(p0) = COPY $x0
+ %ptr:_(p0) = COPY $x1
+ %1:_(s32) = COPY $w1
+ %2:_(s32) = G_CONSTANT i32 1
+ %4:_(s32) = G_CONSTANT i32 2
+ %3:_(s32) = G_OR %1, %2
+ %5:_(s32) = G_SHL %3, %4(s32)
+ G_STORE %5(s32), %0(p0) :: (store (s32))
+ G_STORE %3(s32), %ptr(p0) :: (store (s32))
+ RET_ReallyLR
+
+...
+---
+name: shl_add_k_vector
+alignment: 4
+tracksRegLiveness: true
+body: |
+ bb.1:
+ liveins: $w1, $x0
+
+ ; CHECK-LABEL: name: shl_add_k_vector
+ ; CHECK: liveins: $w1, $x0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1
+ ; CHECK-NEXT: %xvec:_(<4 x s32>) = G_BUILD_VECTOR [[COPY1]](s32), [[COPY1]](s32), [[COPY1]](s32), [[COPY1]](s32)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+ ; CHECK-NEXT: %veccst2:_(<4 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32)
+ ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(<4 x s32>) = G_SHL %xvec, %veccst2(<4 x s32>)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+ ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C1]](s32), [[C1]](s32), [[C1]](s32), [[C1]](s32)
+ ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(<4 x s32>) = G_ADD [[SHL]], [[BUILD_VECTOR]]
+ ; CHECK-NEXT: G_STORE [[ADD]](<4 x s32>), [[COPY]](p0) :: (store (<4 x s32>))
+ ; CHECK-NEXT: RET_ReallyLR
+ %0:_(p0) = COPY $x0
+ %1:_(s32) = COPY $w1
+ %xvec:_(<4 x s32>) = G_BUILD_VECTOR %1, %1, %1, %1
+ %2:_(s32) = G_CONSTANT i32 1
+ %veccst:_(<4 x s32>) = G_BUILD_VECTOR %2, %2, %2, %2
+ %4:_(s32) = G_CONSTANT i32 2
+ %veccst2:_(<4 x s32>) = G_BUILD_VECTOR %4, %4, %4, %4
+ %3:_(<4 x s32>) = G_ADD %xvec, %veccst2
+ %5:_(<4 x s32>) = G_SHL %3, %veccst2
+ G_STORE %5(<4 x s32>), %0(p0) :: (store (<4 x s32>))
+ RET_ReallyLR
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add_shl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add_shl.ll
index 141a31cbe9e72..a727ed39c79c6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add_shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add_shl.ll
@@ -101,19 +101,19 @@ define amdgpu_ps float @add_shl_vgpr_const(i32 %a, i32 %b) {
define amdgpu_ps float @add_shl_vgpr_const_inline_const(i32 %a) {
; VI-LABEL: add_shl_vgpr_const_inline_const:
; VI: ; %bb.0:
-; VI-NEXT: v_add_u32_e32 v0, vcc, 0x3f4, v0
; VI-NEXT: v_lshlrev_b32_e32 v0, 9, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7e800, v0
; VI-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: add_shl_vgpr_const_inline_const:
; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v1, 0x3f4
-; GFX9-NEXT: v_add_lshl_u32 v0, v0, v1, 9
+; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e800
+; GFX9-NEXT: v_lshl_add_u32 v0, v0, 9, v1
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: add_shl_vgpr_const_inline_const:
; GFX10: ; %bb.0:
-; GFX10-NEXT: v_add_lshl_u32 v0, 0x3f4, v0, 9
+; GFX10-NEXT: v_lshl_add_u32 v0, v0, 9, 0x7e800
; GFX10-NEXT: ; return to shader part epilog
%x = add i32 %a, 1012
%result = shl i32 %x, 9
@@ -124,18 +124,19 @@ define amdgpu_ps float @add_shl_vgpr_const_inline_const(i32 %a) {
define amdgpu_ps float @add_shl_vgpr_inline_const_x2(i32 %a) {
; VI-LABEL: add_shl_vgpr_inline_const_x2:
; VI: ; %bb.0:
-; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
; VI-NEXT: v_lshlrev_b32_e32 v0, 9, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, 0x600, v0
; VI-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: add_shl_vgpr_inline_const_x2:
; GFX9: ; %bb.0:
-; GFX9-NEXT: v_add_lshl_u32 v0, v0, 3, 9
+; GFX9-NEXT: v_mov_b32_e32 v1, 0x600
+; GFX9-NEXT: v_lshl_add_u32 v0, v0, 9, v1
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: add_shl_vgpr_inline_const_x2:
; GFX10: ; %bb.0:
-; GFX10-NEXT: v_add_lshl_u32 v0, v0, 3, 9
+; GFX10-NEXT: v_lshl_add_u32 v0, v0, 9, 0x600
; GFX10-NEXT: ; return to shader part epilog
%x = add i32 %a, 3
%result = shl i32 %x, 9