[llvm] 086601e - [GlobalISel] Implement some binary reassociations, G_ADD for now
Amara Emerson via llvm-commits
llvm-commits at lists.llvm.org
Thu Jun 8 21:38:14 PDT 2023
Author: Amara Emerson
Date: 2023-06-08T21:14:58-07:00
New Revision: 086601eac266ec253bf313c746390ff3e5656132
URL: https://github.com/llvm/llvm-project/commit/086601eac266ec253bf313c746390ff3e5656132
DIFF: https://github.com/llvm/llvm-project/commit/086601eac266ec253bf313c746390ff3e5656132.diff
LOG: [GlobalISel] Implement some binary reassociations, G_ADD for now
- (op (op X, C1), C2) -> (op X, (op C1, C2))
- (op (op X, C1), Y) -> (op (op X, Y), C1)
This unfortunately duplicates some code with the G_PTR_ADD reassociations, but
I don't see an easy way to avoid it.
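As a concrete sketch (mirroring the AArch64 test added below; register names
are illustrative), the two rewrites on G_ADD look like:

  ; (op (op X, C1), C2) -> (op X, (op C1, C2)): the constants fold together.
  %inner:_(s64) = G_ADD %x, %C1
  %outer:_(s64) = G_ADD %inner, %C2
    ; ==>
  %folded:_(s64) = G_ADD %C1, %C2   ; constant-folds to a single G_CONSTANT
  %outer:_(s64) = G_ADD %x, %folded

  ; (op (op X, C1), Y) -> (op (op X, Y), C1): the constant sinks to the outer
  ; RHS, but only when the inner op has a single non-debug use.
  %inner:_(s64) = G_ADD %x, %C1
  %outer:_(s64) = G_ADD %inner, %y
    ; ==>
  %xy:_(s64) = G_ADD %x, %y
  %outer:_(s64) = G_ADD %xy, %C1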
Differential Revision: https://reviews.llvm.org/D150230
Added:
llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-binop-reassoc.mir
Modified:
llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
llvm/include/llvm/CodeGen/TargetLowering.h
llvm/include/llvm/Target/GlobalISel/Combine.td
llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/lib/Target/AMDGPU/SIISelLowering.h
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll
Removed:
################################################################################
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index b6946936f8acc..0177d9d423be1 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -631,6 +631,12 @@ class CombinerHelper {
/// addressing mode usage.
bool matchReassocPtrAdd(MachineInstr &MI, BuildFnTy &MatchInfo);
+ /// Try to reassociate the operands of a commutative binop.
+ bool tryReassocBinOp(unsigned Opc, Register DstReg, Register Op0,
+ Register Op1, BuildFnTy &MatchInfo);
+ /// Reassociate commutative binary operations like G_ADD.
+ bool matchReassocCommBinOp(MachineInstr &MI, BuildFnTy &MatchInfo);
+
/// Do constant folding when opportunities are exposed after MIR building.
bool matchConstantFold(MachineInstr &MI, APInt &MatchInfo);
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index d0b9da0382ddf..0aa4fa2123369 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -32,6 +32,7 @@
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/LowLevelTypeUtils.h"
#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
@@ -3584,6 +3585,17 @@ class TargetLowering : public TargetLoweringBase {
return N0.hasOneUse();
}
+ // Lets the target control the following reassociation of operands:
+ // (op (op x, c1), y) -> (op (op x, y), c1), where N0 is (op x, c1) and N1 is
+ // y. By default, any case where N0 has a single use is considered
+ // profitable; this reflects the condition this hook call replaces in the
+ // combiner. A target can implement its own heuristic to restrict the common
+ // combiner.
+ virtual bool isReassocProfitable(MachineRegisterInfo &MRI, Register N0,
+ Register N1) const {
+ return MRI.hasOneNonDBGUse(N0);
+ }
+
virtual bool isSDNodeAlwaysUniform(const SDNode * N) const {
return false;
}
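For targets wanting a different heuristic, a minimal override sketch of the new
hook might look like this (MyTargetLowering and the load/store check are
hypothetical, not part of this patch):

  bool MyTargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
                                             Register N0, Register N1) const {
    // Keep the generic single-use requirement on the inner op...
    if (!MRI.hasOneNonDBGUse(N0))
      return false;
    // ...but additionally leave chains alone when the sole user is a memory
    // access, where the existing (op x, c1) shape may feed an addressing mode.
    return !MRI.use_instr_nodbg_begin(N0)->mayLoadOrStore();
  }

The AMDGPU overrides below simply return true for now, with a FIXME to account
for register banks.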
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index ed4d3f3c0f8f9..5bd92c856999e 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -862,7 +862,13 @@ def reassoc_ptradd : GICombineRule<
[{ return Helper.matchReassocPtrAdd(*${root}, ${matchinfo}); }]),
(apply [{ Helper.applyBuildFnNoErase(*${root}, ${matchinfo}); }])>;
-def reassocs : GICombineGroup<[reassoc_ptradd]>;
+def reassoc_comm_binops : GICombineRule<
+ (defs root:$root, build_fn_matchinfo:$matchinfo),
+ (match (G_ADD $root, $src1, $src2):$root,
+ [{ return Helper.matchReassocCommBinOp(*${root}, ${matchinfo}); }]),
+ (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>;
+
+def reassocs : GICombineGroup<[reassoc_ptradd, reassoc_comm_binops]>;
// Constant fold operations.
def constant_fold : GICombineRule<
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index d958c9c1dda04..94a756089f98a 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -4501,6 +4501,58 @@ bool CombinerHelper::matchReassocPtrAdd(MachineInstr &MI,
return false;
}
+bool CombinerHelper::tryReassocBinOp(unsigned Opc, Register DstReg,
+ Register OpLHS, Register OpRHS,
+ BuildFnTy &MatchInfo) {
+ LLT OpRHSTy = MRI.getType(OpRHS);
+ MachineInstr *OpLHSDef = MRI.getVRegDef(OpLHS);
+
+ if (OpLHSDef->getOpcode() != Opc)
+ return false;
+
+ MachineInstr *OpRHSDef = MRI.getVRegDef(OpRHS);
+ Register OpLHSLHS = OpLHSDef->getOperand(1).getReg();
+ Register OpLHSRHS = OpLHSDef->getOperand(2).getReg();
+
+ if (isConstantOrConstantSplatVector(*MRI.getVRegDef(OpLHSRHS), MRI)) {
+ if (isConstantOrConstantSplatVector(*OpRHSDef, MRI)) {
+ // (Opc (Opc X, C1), C2) -> (Opc X, (Opc C1, C2))
+ MatchInfo = [=](MachineIRBuilder &B) {
+ auto NewCst = B.buildInstr(Opc, {OpRHSTy}, {OpLHSRHS, OpRHS});
+ B.buildInstr(Opc, {DstReg}, {OpLHSLHS, NewCst});
+ };
+ return true;
+ }
+ if (getTargetLowering().isReassocProfitable(MRI, OpLHS, OpRHS) &&
+ MRI.hasOneNonDBGUse(OpLHSLHS)) {
+ // Reassociate: (op (op x, c1), y) -> (op (op x, y), c1)
+ // iff (op x, c1) has one use
+ MatchInfo = [=](MachineIRBuilder &B) {
+ auto NewLHSLHS = B.buildInstr(Opc, {OpRHSTy}, {OpLHSLHS, OpRHS});
+ B.buildInstr(Opc, {DstReg}, {NewLHSLHS, OpLHSRHS});
+ };
+ return true;
+ }
+ }
+
+ return false;
+}
+
+bool CombinerHelper::matchReassocCommBinOp(MachineInstr &MI,
+ BuildFnTy &MatchInfo) {
+ // We don't check if the reassociation will break a legal addressing mode
+ // here since pointer arithmetic is handled by G_PTR_ADD.
+ unsigned Opc = MI.getOpcode();
+ Register DstReg = MI.getOperand(0).getReg();
+ Register LHSReg = MI.getOperand(1).getReg();
+ Register RHSReg = MI.getOperand(2).getReg();
+
+ if (tryReassocBinOp(Opc, DstReg, LHSReg, RHSReg, MatchInfo))
+ return true;
+ if (tryReassocBinOp(Opc, DstReg, RHSReg, LHSReg, MatchInfo))
+ return true;
+ return false;
+}
bool CombinerHelper::matchConstantFold(MachineInstr &MI, APInt &MatchInfo) {
Register Op1 = MI.getOperand(1).getReg();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 38a4bd9260913..09d9430aea49a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -5158,6 +5158,11 @@ bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
}
}
+bool AMDGPUTargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
+ Register N0, Register N1) const {
+ return true; // FIXME: handle regbanks
+}
+
TargetLowering::AtomicExpansionKind
AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
switch (RMW->getOperation()) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index bc35c4ea0b3ea..c2d6a225a2618 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -292,6 +292,9 @@ class AMDGPUTargetLowering : public TargetLowering {
bool SNaN = false,
unsigned Depth = 0) const override;
+ bool isReassocProfitable(MachineRegisterInfo &MRI, Register N0,
+ Register N1) const override;
+
/// Helper function that adds Reg to the LiveIn list of the DAG's
/// MachineFunction.
///
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index f090b0c4c3278..8c24789d53c8b 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -13572,6 +13572,11 @@ bool SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
hasMemSDNodeUser(*N0->use_begin()));
}
+bool SITargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
+ Register N0, Register N1) const {
+ return true; // FIXME: handle regbanks
+}
+
MachineMemOperand::Flags
SITargetLowering::getTargetMMOFlags(const Instruction &I) const {
// Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 4a18c349e8779..5dcb79d4d7ae7 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -495,6 +495,9 @@ class SITargetLowering final : public AMDGPUTargetLowering {
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0,
SDValue N1) const override;
+ bool isReassocProfitable(MachineRegisterInfo &MRI, Register N0,
+ Register N1) const override;
+
bool isCanonicalized(SelectionDAG &DAG, SDValue Op,
unsigned MaxDepth = 5) const;
bool isCanonicalized(Register Reg, MachineFunction &MF,
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-binop-reassoc.mir b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-binop-reassoc.mir
new file mode 100644
index 0000000000000..f1676e48e5113
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-binop-reassoc.mir
@@ -0,0 +1,124 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs -mtriple aarch64-unknown-unknown %s -o - | FileCheck %s
+
+# Combines: (Opc (Opc X, C1), C2) -> (Opc X, (Opc C1, C2))
+---
+name: test1_add_move_inner_cst_to_fold
+alignment: 4
+tracksRegLiveness: true
+liveins:
+ - { reg: '$x0' }
+body: |
+ bb.1:
+ liveins: $x0
+
+ ; CHECK-LABEL: name: test1_add_move_inner_cst_to_fold
+ ; CHECK: liveins: $x0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %x:_(s64) = COPY $x0
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 88
+ ; CHECK-NEXT: %add_outer:_(s64) = G_ADD %x, [[C]]
+ ; CHECK-NEXT: $x0 = COPY %add_outer(s64)
+ ; CHECK-NEXT: RET_ReallyLR implicit $x0
+ %x:_(s64) = COPY $x0
+ %C1:_(s64) = G_CONSTANT i64 64
+ %C2:_(s64) = G_CONSTANT i64 24
+ %add_inner:_(s64) = G_ADD %x, %C1
+ %add_outer:_(s64) = G_ADD %add_inner, %C2
+ $x0 = COPY %add_outer
+ RET_ReallyLR implicit $x0
+
+...
+
+# (op (op x, c1), y) -> (op (op x, y), c1)
+---
+name: test2_add_move_inner_cst_to_rhs
+alignment: 4
+tracksRegLiveness: true
+liveins:
+ - { reg: '$x0' }
+body: |
+ bb.1:
+ liveins: $x0, $x1
+
+ ; CHECK-LABEL: name: test2_add_move_inner_cst_to_rhs
+ ; CHECK: liveins: $x0, $x1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %x:_(s64) = COPY $x0
+ ; CHECK-NEXT: %C1:_(s64) = G_CONSTANT i64 64
+ ; CHECK-NEXT: %y:_(s64) = COPY $x1
+ ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD %x, %y
+ ; CHECK-NEXT: %add_outer:_(s64) = G_ADD [[ADD]], %C1
+ ; CHECK-NEXT: $x0 = COPY %add_outer(s64)
+ ; CHECK-NEXT: RET_ReallyLR implicit $x0
+ %x:_(s64) = COPY $x0
+ %C1:_(s64) = G_CONSTANT i64 64
+ %y:_(s64) = COPY $x1
+ %add_inner:_(s64) = G_ADD %x, %C1
+ %add_outer:_(s64) = G_ADD %add_inner, %y
+ $x0 = COPY %add_outer
+ RET_ReallyLR implicit $x0
+
+...
+---
+name: test2_add_move_inner_cst_to_rhs_multiuse
+alignment: 4
+tracksRegLiveness: true
+liveins:
+ - { reg: '$x0' }
+body: |
+ bb.1:
+ liveins: $x0, $x1
+
+ ; CHECK-LABEL: name: test2_add_move_inner_cst_to_rhs_multiuse
+ ; CHECK: liveins: $x0, $x1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %x:_(s64) = COPY $x0
+ ; CHECK-NEXT: %C1:_(s64) = G_CONSTANT i64 64
+ ; CHECK-NEXT: %y:_(s64) = COPY $x1
+ ; CHECK-NEXT: %add_inner:_(s64) = G_ADD %x, %C1
+ ; CHECK-NEXT: %add_outer:_(s64) = G_ADD %add_inner, %y
+ ; CHECK-NEXT: $x0 = COPY %add_outer(s64)
+ ; CHECK-NEXT: $x1 = COPY %add_inner(s64)
+ ; CHECK-NEXT: RET_ReallyLR implicit $x0
+ %x:_(s64) = COPY $x0
+ %C1:_(s64) = G_CONSTANT i64 64
+ %y:_(s64) = COPY $x1
+ %add_inner:_(s64) = G_ADD %x, %C1
+ %add_outer:_(s64) = G_ADD %add_inner, %y
+ $x0 = COPY %add_outer
+ $x1 = COPY %add_inner
+ RET_ReallyLR implicit $x0
+
+...
+---
+name: test2_add_move_inner_cst_to_rhs_vector
+alignment: 4
+tracksRegLiveness: true
+liveins:
+ - { reg: '$x0' }
+body: |
+ bb.1:
+ liveins: $q0, $q1
+
+ ; CHECK-LABEL: name: test2_add_move_inner_cst_to_rhs_vector
+ ; CHECK: liveins: $q0, $q1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %x:_(<2 x s64>) = COPY $q0
+ ; CHECK-NEXT: %C1:_(s64) = G_CONSTANT i64 64
+ ; CHECK-NEXT: %VEC_C1:_(<2 x s64>) = G_BUILD_VECTOR %C1(s64), %C1(s64)
+ ; CHECK-NEXT: %y:_(<2 x s64>) = COPY $q1
+ ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(<2 x s64>) = G_ADD %x, %y
+ ; CHECK-NEXT: %add_outer:_(<2 x s64>) = G_ADD [[ADD]], %VEC_C1
+ ; CHECK-NEXT: $q0 = COPY %add_outer(<2 x s64>)
+ ; CHECK-NEXT: RET_ReallyLR implicit $q0
+ %x:_(<2 x s64>) = COPY $q0
+ %C1:_(s64) = G_CONSTANT i64 64
+ %VEC_C1:_(<2 x s64>) = G_BUILD_VECTOR %C1, %C1
+ %y:_(<2 x s64>) = COPY $q1
+ %add_inner:_(<2 x s64>) = G_ADD %x, %VEC_C1
+ %add_outer:_(<2 x s64>) = G_ADD %add_inner, %y
+ $q0 = COPY %add_outer
+ RET_ReallyLR implicit $q0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll
index 404931cb76615..9d0c6b0cb1d49 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll
@@ -5016,9 +5016,10 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_imm_sgpr_vgpr(<4 x i32> inr
; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX6-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1024
- ; GFX6-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY5]], [[S_MOV_B32_]], implicit-def $scc
- ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32))
+ ; GFX6-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY6]], [[COPY4]], 0, implicit $exec
+ ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, implicit $exec :: (dereferenceable invariant load (s32))
; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
; GFX7-LABEL: name: s_buffer_load_f32_offset_add_imm_sgpr_vgpr
@@ -5032,9 +5033,10 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_imm_sgpr_vgpr(<4 x i32> inr
; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX7-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1024
- ; GFX7-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY5]], [[S_MOV_B32_]], implicit-def $scc
- ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32))
+ ; GFX7-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY6]], [[COPY4]], 0, implicit $exec
+ ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, implicit $exec :: (dereferenceable invariant load (s32))
; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
; GFX8-LABEL: name: s_buffer_load_f32_offset_add_imm_sgpr_vgpr
@@ -5048,9 +5050,10 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_imm_sgpr_vgpr(<4 x i32> inr
; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX8-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1024
- ; GFX8-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY5]], [[S_MOV_B32_]], implicit-def $scc
- ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32))
+ ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY6]], [[COPY4]], 0, implicit $exec
+ ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, implicit $exec :: (dereferenceable invariant load (s32))
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%offset.base = add i32 %offset.s, 1024
@@ -5071,10 +5074,10 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_imm_vgpr_sgpr(<4 x i32> inr
; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX6-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1024
- ; GFX6-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX6-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec
- ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32))
+ ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, implicit $exec :: (dereferenceable invariant load (s32))
; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
; GFX7-LABEL: name: s_buffer_load_f32_offset_add_imm_vgpr_sgpr
@@ -5088,10 +5091,10 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_imm_vgpr_sgpr(<4 x i32> inr
; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX7-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1024
- ; GFX7-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX7-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec
- ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32))
+ ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, implicit $exec :: (dereferenceable invariant load (s32))
; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
; GFX8-LABEL: name: s_buffer_load_f32_offset_add_imm_vgpr_sgpr
@@ -5105,10 +5108,10 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_imm_vgpr_sgpr(<4 x i32> inr
; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX8-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1024
- ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec
- ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32))
+ ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, implicit $exec :: (dereferenceable invariant load (s32))
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%offset.base = add i32 %offset.v, 1024
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll
index 4b82c99e39732..8c24a28211b70 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll
@@ -3057,12 +3057,14 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_imm_sgpr_vgpr(<4 x i32> inr
; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
+ ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32)
+ ; CHECK-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY6]], [[COPY4]]
; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024
- ; CHECK-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY5]], [[C]]
- ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32)
- ; CHECK-NEXT: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY6]], [[COPY4]]
- ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
- ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[ADD]], 0, 0, 0 :: (dereferenceable invariant load (s32))
+ ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
+ ; CHECK-NEXT: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[COPY7]]
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 1024, 0, 0 :: (dereferenceable invariant load (s32))
; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
; GREEDY-LABEL: name: s_buffer_load_f32_offset_add_imm_sgpr_vgpr
@@ -3076,12 +3078,14 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_imm_sgpr_vgpr(<4 x i32> inr
; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
; GREEDY-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
; GREEDY-NEXT: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
+ ; GREEDY-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32)
+ ; GREEDY-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY6]], [[COPY4]]
; GREEDY-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024
- ; GREEDY-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY5]], [[C]]
- ; GREEDY-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32)
- ; GREEDY-NEXT: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY6]], [[COPY4]]
- ; GREEDY-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
- ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[ADD]], 0, 0, 0 :: (dereferenceable invariant load (s32))
+ ; GREEDY-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
+ ; GREEDY-NEXT: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[COPY7]]
+ ; GREEDY-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 1024, 0, 0 :: (dereferenceable invariant load (s32))
; GREEDY-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
; GREEDY-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%offset.base = add i32 %offset.s, 1024
@@ -3102,13 +3106,14 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_imm_vgpr_sgpr(<4 x i32> inr
; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
- ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024
- ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
+ ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32)
; CHECK-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY6]]
- ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024
+ ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
; CHECK-NEXT: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[COPY7]]
- ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
- ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[ADD]], [[COPY5]], 0, 0, 0 :: (dereferenceable invariant load (s32))
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 1024, 0, 0 :: (dereferenceable invariant load (s32))
; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
; GREEDY-LABEL: name: s_buffer_load_f32_offset_add_imm_vgpr_sgpr
@@ -3122,13 +3127,14 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_imm_vgpr_sgpr(<4 x i32> inr
; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
; GREEDY-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
; GREEDY-NEXT: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
- ; GREEDY-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024
- ; GREEDY-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
+ ; GREEDY-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32)
; GREEDY-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY6]]
- ; GREEDY-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32)
+ ; GREEDY-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024
+ ; GREEDY-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
; GREEDY-NEXT: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[COPY7]]
- ; GREEDY-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
- ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[ADD]], [[COPY5]], 0, 0, 0 :: (dereferenceable invariant load (s32))
+ ; GREEDY-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 1024, 0, 0 :: (dereferenceable invariant load (s32))
; GREEDY-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
; GREEDY-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%offset.base = add i32 %offset.v, 1024