[clang-tools-extra] [AMDGPU] Rematerialize scalar loads (PR #68778)
Piotr Sobczak via cfe-commits
cfe-commits at lists.llvm.org
Wed Oct 25 04:05:15 PDT 2023
https://github.com/piotrAMD updated https://github.com/llvm/llvm-project/pull/68778
>From 6b5ada294d999ba1412020806ce8fab8e34a408e Mon Sep 17 00:00:00 2001
From: Piotr Sobczak <piotr.sobczak at amd.com>
Date: Wed, 11 Oct 2023 10:32:57 +0200
Subject: [PATCH 1/5] [AMDGPU] Rematerialize scalar loads
Extend the list of instructions that can be rematerialized in
SIInstrInfo::isReallyTriviallyReMaterializable() to support scalar loads.
Try shrinking instructions to remat only the part needed for current
context. Add SIInstrInfo::reMaterialize target hook, and handle shrinking
of S_LOAD_DWORDX16_IMM to S_LOAD_DWORDX8_IMM as a proof of concept.
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 110 +++++++++++++++++
llvm/lib/Target/AMDGPU/SIInstrInfo.h | 5 +
.../hsa-metadata-kernel-code-props-v3.ll | 4 +-
llvm/test/CodeGen/AMDGPU/remat-smrd.mir | 116 +++++-------------
.../AMDGPU/snippet-copy-bundle-regression.mir | 42 +------
5 files changed, 151 insertions(+), 126 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 51397cbb791469d..31a086cd2a61820 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -108,7 +108,18 @@ static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
bool SIInstrInfo::isReallyTriviallyReMaterializable(
const MachineInstr &MI) const {
+
+ bool CanRemat = false;
if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isSDWA(MI) || isSALU(MI)) {
+ CanRemat = true;
+ } else if (isSMRD(MI)) {
+ CanRemat = !MI.memoperands_empty() &&
+ llvm::all_of(MI.memoperands(), [](const MachineMemOperand *MMO) {
+ return MMO->isLoad() && MMO->isInvariant();
+ });
+ }
+
+ if (CanRemat) {
// Normally VALU use of exec would block the rematerialization, but that
// is OK in this case to have an implicit exec read as all VALU do.
// We really want all of the generic logic for this except for this.
@@ -2434,6 +2445,105 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
return true;
}
+void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I, Register DestReg,
+ unsigned SubIdx, const MachineInstr &Orig,
+ const TargetRegisterInfo &RI) const {
+
+ // Try shrinking the instruction to remat only the part needed for current
+ // context.
+ // TODO: Handle more cases.
+ unsigned Opcode = Orig.getOpcode();
+ switch (Opcode) {
+ case AMDGPU::S_LOAD_DWORDX16_IMM:
+ case AMDGPU::S_LOAD_DWORDX8_IMM: {
+ if (SubIdx != 0)
+ break;
+
+ if (I == MBB.end())
+ break;
+
+ if (I->isBundled())
+ break;
+
+ // Look for a single use of the register that is also a subreg.
+ Register RegToFind = Orig.getOperand(0).getReg();
+ int SingleUseIdx = -1;
+ for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
+ const MachineOperand &CandMO = I->getOperand(i);
+ if (!CandMO.isReg())
+ continue;
+ Register CandReg = CandMO.getReg();
+ if (!CandReg)
+ continue;
+
+ if (CandReg == RegToFind || RI.regsOverlap(CandReg, RegToFind)) {
+ if (SingleUseIdx == -1 && CandMO.isUse()) {
+ SingleUseIdx = i;
+ } else {
+ SingleUseIdx = -1;
+ break;
+ }
+ }
+ }
+ if (SingleUseIdx == -1)
+ break;
+ MachineOperand *UseMO = &I->getOperand(SingleUseIdx);
+ if (UseMO->getSubReg() == AMDGPU::NoSubRegister)
+ break;
+
+ unsigned Offset = RI.getSubRegIdxOffset(UseMO->getSubReg());
+ unsigned SubregSize = RI.getSubRegIdxSize(UseMO->getSubReg());
+
+ MachineFunction *MF = MBB.getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ assert(MRI.hasAtMostUserInstrs(DestReg, 0) &&
+ "DestReg should have no users yet.");
+
+ unsigned NewOpcode = -1;
+ if (SubregSize == 256)
+ NewOpcode = AMDGPU::S_LOAD_DWORDX8_IMM;
+ else if (SubregSize == 128)
+ NewOpcode = AMDGPU::S_LOAD_DWORDX4_IMM;
+ else
+ break;
+
+ const MCInstrDesc &TID = get(NewOpcode);
+ const TargetRegisterClass *NewRC =
+ RI.getAllocatableClass(getRegClass(TID, 0, &RI, *MF));
+ MRI.setRegClass(DestReg, NewRC);
+
+ UseMO->setReg(DestReg);
+ UseMO->setSubReg(AMDGPU::NoSubRegister);
+
+ // Use a smaller load with the desired size, possibly with updated offset.
+ MachineInstr *MI = MF->CloneMachineInstr(&Orig);
+ MI->setDesc(TID);
+ MI->getOperand(0).setReg(DestReg);
+ MI->getOperand(0).setSubReg(AMDGPU::NoSubRegister);
+ if (Offset) {
+ MachineOperand *OffsetMO = getNamedOperand(*MI, AMDGPU::OpName::offset);
+ int64_t FinalOffset = OffsetMO->getImm() + Offset / 8;
+ OffsetMO->setImm(FinalOffset);
+ }
+ SmallVector<MachineMemOperand *> NewMMOs;
+ for (const MachineMemOperand *MemOp : Orig.memoperands())
+ NewMMOs.push_back(MF->getMachineMemOperand(MemOp, MemOp->getPointerInfo(),
+ SubregSize / 8));
+ MI->setMemRefs(*MF, NewMMOs);
+
+ MBB.insert(I, MI);
+ return;
+ }
+
+ default:
+ break;
+ }
+ MachineInstr *MI = MBB.getParent()->CloneMachineInstr(&Orig);
+ MI->substituteRegister(MI->getOperand(0).getReg(), DestReg, SubIdx, RI);
+ MBB.insert(I, MI);
+}
+
std::pair<MachineInstr*, MachineInstr*>
SIInstrInfo::expandMovDPP64(MachineInstr &MI) const {
assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 5ef17c44f7de389..a64cf0244e4c08c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -275,6 +275,11 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
bool expandPostRAPseudo(MachineInstr &MI) const override;
+ void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ Register DestReg, unsigned SubIdx,
+ const MachineInstr &Orig,
+ const TargetRegisterInfo &TRI) const override;
+
// Splits a V_MOV_B64_DPP_PSEUDO opcode into a pair of v_mov_b32_dpp
// instructions. Returns a pair of generated instructions.
// Can split either post-RA with physical registers or pre-RA with
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll
index d6f7a92af9dcb6f..1999c7b065e6854 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll
@@ -47,8 +47,8 @@ entry:
}
; CHECK: .name: num_spilled_sgprs
-; GFX700: .sgpr_spill_count: 38
-; GFX803: .sgpr_spill_count: 22
+; GFX700: .sgpr_spill_count: 12
+; GFX803: .sgpr_spill_count: 12
; GFX900: .sgpr_spill_count: 48
; GFX1010: .sgpr_spill_count: 48
; CHECK: .symbol: num_spilled_sgprs.kd
diff --git a/llvm/test/CodeGen/AMDGPU/remat-smrd.mir b/llvm/test/CodeGen/AMDGPU/remat-smrd.mir
index 753cb135fdae9c0..95eac12a65389ef 100644
--- a/llvm/test/CodeGen/AMDGPU/remat-smrd.mir
+++ b/llvm/test/CodeGen/AMDGPU/remat-smrd.mir
@@ -12,15 +12,11 @@ body: |
; GCN: liveins: $sgpr8_sgpr9
; GCN-NEXT: {{ $}}
; GCN-NEXT: renamable $sgpr2_sgpr3 = COPY $sgpr8_sgpr9
- ; GCN-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM renamable $sgpr2_sgpr3, 0, 0 :: (dereferenceable invariant load (s32), addrspace 4)
- ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr0, %stack.0, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.0, addrspace 5)
; GCN-NEXT: renamable $sgpr1 = S_LOAD_DWORD_IMM renamable $sgpr2_sgpr3, 4, 0 :: (dereferenceable invariant load (s32), addrspace 4)
- ; GCN-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM renamable $sgpr2_sgpr3, 8, 0 :: (dereferenceable invariant load (s32), addrspace 4)
- ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr0, %stack.1, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.1, addrspace 5)
- ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.0, addrspace 5)
+ ; GCN-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM renamable $sgpr2_sgpr3, 0, 0 :: (dereferenceable invariant load (s32), addrspace 4)
; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0
; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr1
- ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.1, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.1, addrspace 5)
+ ; GCN-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM renamable $sgpr2_sgpr3, 8, 0 :: (dereferenceable invariant load (s32), addrspace 4)
; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0
; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $sgpr2_sgpr3
%0:sreg_64_xexec = COPY $sgpr8_sgpr9
@@ -44,16 +40,10 @@ body: |
; GCN-NEXT: {{ $}}
; GCN-NEXT: renamable $sgpr0_sgpr1 = COPY $sgpr8_sgpr9
; GCN-NEXT: renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM renamable $sgpr0_sgpr1, 0, 0 :: (dereferenceable invariant load (s64), addrspace 4)
- ; GCN-NEXT: SI_SPILL_S64_SAVE killed renamable $sgpr2_sgpr3, %stack.1, implicit $exec, implicit $sp_reg :: (store (s64) into %stack.1, align 4, addrspace 5)
- ; GCN-NEXT: renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM renamable $sgpr0_sgpr1, 4, 0 :: (dereferenceable invariant load (s64), addrspace 4)
- ; GCN-NEXT: SI_SPILL_S64_SAVE killed renamable $sgpr2_sgpr3, %stack.0, implicit $exec, implicit $sp_reg :: (store (s64) into %stack.0, align 4, addrspace 5)
- ; GCN-NEXT: renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM renamable $sgpr0_sgpr1, 8, 0 :: (dereferenceable invariant load (s64), addrspace 4)
- ; GCN-NEXT: SI_SPILL_S64_SAVE killed renamable $sgpr2_sgpr3, %stack.2, implicit $exec, implicit $sp_reg :: (store (s64) into %stack.2, align 4, addrspace 5)
- ; GCN-NEXT: renamable $sgpr2_sgpr3 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sp_reg :: (load (s64) from %stack.1, align 4, addrspace 5)
; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr2_sgpr3
- ; GCN-NEXT: renamable $sgpr2_sgpr3 = SI_SPILL_S64_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s64) from %stack.0, align 4, addrspace 5)
+ ; GCN-NEXT: renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM renamable $sgpr0_sgpr1, 4, 0 :: (dereferenceable invariant load (s64), addrspace 4)
; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr2_sgpr3
- ; GCN-NEXT: renamable $sgpr2_sgpr3 = SI_SPILL_S64_RESTORE %stack.2, implicit $exec, implicit $sp_reg :: (load (s64) from %stack.2, align 4, addrspace 5)
+ ; GCN-NEXT: renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM renamable $sgpr0_sgpr1, 8, 0 :: (dereferenceable invariant load (s64), addrspace 4)
; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr2_sgpr3
; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $sgpr0_sgpr1
%0:sreg_64_xexec = COPY $sgpr8_sgpr9
@@ -77,16 +67,10 @@ body: |
; GCN-NEXT: {{ $}}
; GCN-NEXT: renamable $sgpr0_sgpr1 = COPY $sgpr8_sgpr9
; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7 = S_LOAD_DWORDX4_IMM renamable $sgpr0_sgpr1, 0, 0 :: (dereferenceable invariant load (s128), addrspace 4)
- ; GCN-NEXT: SI_SPILL_S128_SAVE killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, %stack.1, implicit $exec, implicit $sp_reg :: (store (s128) into %stack.1, align 4, addrspace 5)
- ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7 = S_LOAD_DWORDX4_IMM renamable $sgpr0_sgpr1, 4, 0 :: (dereferenceable invariant load (s128), addrspace 4)
- ; GCN-NEXT: SI_SPILL_S128_SAVE killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, %stack.0, implicit $exec, implicit $sp_reg :: (store (s128) into %stack.0, align 4, addrspace 5)
- ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7 = S_LOAD_DWORDX4_IMM renamable $sgpr0_sgpr1, 8, 0 :: (dereferenceable invariant load (s128), addrspace 4)
- ; GCN-NEXT: SI_SPILL_S128_SAVE killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, %stack.2, implicit $exec, implicit $sp_reg :: (store (s128) into %stack.2, align 4, addrspace 5)
- ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7 = SI_SPILL_S128_RESTORE %stack.1, implicit $exec, implicit $sp_reg :: (load (s128) from %stack.1, align 4, addrspace 5)
; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7
- ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7 = SI_SPILL_S128_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s128) from %stack.0, align 4, addrspace 5)
+ ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7 = S_LOAD_DWORDX4_IMM renamable $sgpr0_sgpr1, 4, 0 :: (dereferenceable invariant load (s128), addrspace 4)
; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7
- ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7 = SI_SPILL_S128_RESTORE %stack.2, implicit $exec, implicit $sp_reg :: (load (s128) from %stack.2, align 4, addrspace 5)
+ ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7 = S_LOAD_DWORDX4_IMM renamable $sgpr0_sgpr1, 8, 0 :: (dereferenceable invariant load (s128), addrspace 4)
; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7
; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $sgpr0_sgpr1
%0:sreg_64_xexec = COPY $sgpr8_sgpr9
@@ -110,16 +94,10 @@ body: |
; GCN-NEXT: {{ $}}
; GCN-NEXT: renamable $sgpr0_sgpr1 = COPY $sgpr8_sgpr9
; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = S_LOAD_DWORDX8_IMM renamable $sgpr0_sgpr1, 0, 0 :: (dereferenceable invariant load (s256), addrspace 4)
- ; GCN-NEXT: SI_SPILL_S256_SAVE killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, %stack.1, implicit $exec, implicit $sp_reg :: (store (s256) into %stack.1, align 4, addrspace 5)
- ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = S_LOAD_DWORDX8_IMM renamable $sgpr0_sgpr1, 4, 0 :: (dereferenceable invariant load (s256), addrspace 4)
- ; GCN-NEXT: SI_SPILL_S256_SAVE killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, %stack.0, implicit $exec, implicit $sp_reg :: (store (s256) into %stack.0, align 4, addrspace 5)
- ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = S_LOAD_DWORDX8_IMM renamable $sgpr0_sgpr1, 8, 0 :: (dereferenceable invariant load (s256), addrspace 4)
- ; GCN-NEXT: SI_SPILL_S256_SAVE killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, %stack.2, implicit $exec, implicit $sp_reg :: (store (s256) into %stack.2, align 4, addrspace 5)
- ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = SI_SPILL_S256_RESTORE %stack.1, implicit $exec, implicit $sp_reg :: (load (s256) from %stack.1, align 4, addrspace 5)
; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11
- ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = SI_SPILL_S256_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s256) from %stack.0, align 4, addrspace 5)
+ ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = S_LOAD_DWORDX8_IMM renamable $sgpr0_sgpr1, 4, 0 :: (dereferenceable invariant load (s256), addrspace 4)
; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11
- ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = SI_SPILL_S256_RESTORE %stack.2, implicit $exec, implicit $sp_reg :: (load (s256) from %stack.2, align 4, addrspace 5)
+ ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = S_LOAD_DWORDX8_IMM renamable $sgpr0_sgpr1, 8, 0 :: (dereferenceable invariant load (s256), addrspace 4)
; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11
; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $sgpr0_sgpr1
%0:sreg_64_xexec = COPY $sgpr8_sgpr9
@@ -143,16 +121,10 @@ body: |
; GCN-NEXT: {{ $}}
; GCN-NEXT: renamable $sgpr0_sgpr1 = COPY $sgpr8_sgpr9
; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX16_IMM renamable $sgpr0_sgpr1, 0, 0 :: (dereferenceable invariant load (s512), addrspace 4)
- ; GCN-NEXT: SI_SPILL_S512_SAVE killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19, %stack.1, implicit $exec, implicit $sp_reg :: (store (s512) into %stack.1, align 4, addrspace 5)
- ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX16_IMM renamable $sgpr0_sgpr1, 4, 0 :: (dereferenceable invariant load (s512), addrspace 4)
- ; GCN-NEXT: SI_SPILL_S512_SAVE killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19, %stack.0, implicit $exec, implicit $sp_reg :: (store (s512) into %stack.0, align 4, addrspace 5)
- ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX16_IMM renamable $sgpr0_sgpr1, 8, 0 :: (dereferenceable invariant load (s512), addrspace 4)
- ; GCN-NEXT: SI_SPILL_S512_SAVE killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19, %stack.2, implicit $exec, implicit $sp_reg :: (store (s512) into %stack.2, align 4, addrspace 5)
- ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = SI_SPILL_S512_RESTORE %stack.1, implicit $exec, implicit $sp_reg :: (load (s512) from %stack.1, align 4, addrspace 5)
; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
- ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = SI_SPILL_S512_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s512) from %stack.0, align 4, addrspace 5)
+ ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX16_IMM renamable $sgpr0_sgpr1, 4, 0 :: (dereferenceable invariant load (s512), addrspace 4)
; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
- ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = SI_SPILL_S512_RESTORE %stack.2, implicit $exec, implicit $sp_reg :: (load (s512) from %stack.2, align 4, addrspace 5)
+ ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX16_IMM renamable $sgpr0_sgpr1, 8, 0 :: (dereferenceable invariant load (s512), addrspace 4)
; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $sgpr0_sgpr1
%0:sreg_64_xexec = COPY $sgpr8_sgpr9
@@ -175,13 +147,12 @@ body: |
; GCN: liveins: $sgpr8_sgpr9, $vgpr0_vgpr1
; GCN-NEXT: {{ $}}
; GCN-NEXT: renamable $sgpr0_sgpr1 = COPY $sgpr8_sgpr9
- ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX16_IMM renamable $sgpr0_sgpr1, 0, 0 :: (dereferenceable invariant load (s512), align 4, addrspace 4)
- ; GCN-NEXT: SI_SPILL_S512_SAVE killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19, %stack.0, implicit $exec, implicit $sp_reg :: (store (s512) into %stack.0, align 4, addrspace 5)
; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX16_IMM renamable $sgpr0_sgpr1, 128, 0 :: (dereferenceable invariant load (s512), align 4, addrspace 4)
- ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = SI_SPILL_S512_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s512) from %stack.0, align 4, addrspace 5)
+ ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = S_LOAD_DWORDX8_IMM renamable $sgpr0_sgpr1, 32, 0 :: (dereferenceable invariant load (s256), align 4, addrspace 4)
+ ; GCN-NEXT: renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = COPY killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11
; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX16_IMM renamable $sgpr0_sgpr1, 128, 0 :: (dereferenceable invariant load (s512), align 4, addrspace 4)
- ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = SI_SPILL_S512_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s512) from %stack.0, align 4, addrspace 5)
+ ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = S_LOAD_DWORDX8_IMM renamable $sgpr0_sgpr1, 0, 0 :: (dereferenceable invariant load (s256), align 4, addrspace 4)
; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11
; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $sgpr0_sgpr1
%0:sreg_64_xexec = COPY $sgpr8_sgpr9
@@ -206,14 +177,12 @@ body: |
; GCN: liveins: $sgpr8_sgpr9, $vgpr0_vgpr1
; GCN-NEXT: {{ $}}
; GCN-NEXT: renamable $sgpr0_sgpr1 = COPY $sgpr8_sgpr9
- ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX16_IMM renamable $sgpr0_sgpr1, 0, 0 :: (dereferenceable invariant load (s512), align 4, addrspace 4)
- ; GCN-NEXT: SI_SPILL_S512_SAVE killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19, %stack.0, implicit $exec, implicit $sp_reg :: (store (s512) into %stack.0, align 4, addrspace 5)
; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX16_IMM renamable $sgpr0_sgpr1, 128, 0 :: (dereferenceable invariant load (s512), align 4, addrspace 4)
- ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = SI_SPILL_S512_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s512) from %stack.0, align 4, addrspace 5)
+ ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX16_IMM renamable $sgpr0_sgpr1, 0, 0 :: (dereferenceable invariant load (s512), align 4, addrspace 4)
; GCN-NEXT: renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = KILL killed renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19, implicit renamable $sgpr4_sgpr5_sgpr6_sgpr7
; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX16_IMM renamable $sgpr0_sgpr1, 128, 0 :: (dereferenceable invariant load (s512), align 4, addrspace 4)
- ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = SI_SPILL_S512_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s512) from %stack.0, align 4, addrspace 5)
+ ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = S_LOAD_DWORDX8_IMM renamable $sgpr0_sgpr1, 0, 0 :: (dereferenceable invariant load (s256), align 4, addrspace 4)
; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = KILL killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, implicit renamable $sgpr0_sgpr1
; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11
; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $sgpr0_sgpr1
@@ -238,10 +207,8 @@ body: |
; GCN: liveins: $sgpr8_sgpr9, $vgpr0_vgpr1
; GCN-NEXT: {{ $}}
; GCN-NEXT: renamable $sgpr0_sgpr1 = COPY $sgpr8_sgpr9
- ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX16_IMM renamable $sgpr0_sgpr1, 0, 0 :: (dereferenceable invariant load (s512), align 4, addrspace 4)
- ; GCN-NEXT: SI_SPILL_S512_SAVE killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19, %stack.0, implicit $exec, implicit $sp_reg :: (store (s512) into %stack.0, align 4, addrspace 5)
; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX16_IMM renamable $sgpr0_sgpr1, 128, 0 :: (dereferenceable invariant load (s512), align 4, addrspace 4)
- ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = SI_SPILL_S512_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s512) from %stack.0, align 4, addrspace 5)
+ ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX16_IMM renamable $sgpr0_sgpr1, 0, 0 :: (dereferenceable invariant load (s512), align 4, addrspace 4)
; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $sgpr0_sgpr1
%0:sreg_64_xexec = COPY $sgpr8_sgpr9
@@ -265,13 +232,12 @@ body: |
; GCN: liveins: $sgpr8_sgpr9, $vgpr0_vgpr1
; GCN-NEXT: {{ $}}
; GCN-NEXT: renamable $sgpr0_sgpr1 = COPY $sgpr8_sgpr9
- ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = S_LOAD_DWORDX8_IMM renamable $sgpr0_sgpr1, 0, 0 :: (dereferenceable invariant load (s256), align 4, addrspace 4)
- ; GCN-NEXT: SI_SPILL_S256_SAVE killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, %stack.0, implicit $exec, implicit $sp_reg :: (store (s256) into %stack.0, align 4, addrspace 5)
; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = S_LOAD_DWORDX8_IMM renamable $sgpr0_sgpr1, 128, 0 :: (dereferenceable invariant load (s256), align 4, addrspace 4)
- ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = SI_SPILL_S256_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s256) from %stack.0, align 4, addrspace 5)
+ ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7 = S_LOAD_DWORDX4_IMM renamable $sgpr0_sgpr1, 16, 0 :: (dereferenceable invariant load (s128), align 4, addrspace 4)
+ ; GCN-NEXT: renamable $sgpr8_sgpr9_sgpr10_sgpr11 = COPY killed renamable $sgpr4_sgpr5_sgpr6_sgpr7
; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr8_sgpr9_sgpr10_sgpr11
; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = S_LOAD_DWORDX8_IMM renamable $sgpr0_sgpr1, 128, 0 :: (dereferenceable invariant load (s256), align 4, addrspace 4)
- ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = SI_SPILL_S256_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s256) from %stack.0, align 4, addrspace 5)
+ ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7 = S_LOAD_DWORDX4_IMM renamable $sgpr0_sgpr1, 0, 0 :: (dereferenceable invariant load (s128), align 4, addrspace 4)
; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7
; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $sgpr0_sgpr1
%0:sreg_64_xexec = COPY $sgpr8_sgpr9
@@ -331,16 +297,10 @@ body: |
; GCN-NEXT: renamable $sgpr2_sgpr3 = COPY $sgpr8_sgpr9
; GCN-NEXT: renamable $sgpr0 = COPY $sgpr10
; GCN-NEXT: renamable $sgpr1 = S_LOAD_DWORD_SGPR renamable $sgpr2_sgpr3, renamable $sgpr0, 0 :: (dereferenceable invariant load (s32), addrspace 4)
- ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr1, %stack.1, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.1, addrspace 5)
- ; GCN-NEXT: renamable $sgpr1 = S_LOAD_DWORD_SGPR renamable $sgpr2_sgpr3, renamable $sgpr0, 0 :: (dereferenceable invariant load (s32), addrspace 4)
- ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr1, %stack.0, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.0, addrspace 5)
- ; GCN-NEXT: renamable $sgpr1 = S_LOAD_DWORD_SGPR renamable $sgpr2_sgpr3, renamable $sgpr0, 0 :: (dereferenceable invariant load (s32), addrspace 4)
- ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr1, %stack.2, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.2, addrspace 5)
- ; GCN-NEXT: renamable $sgpr1 = SI_SPILL_S32_RESTORE %stack.1, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.1, addrspace 5)
; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr1
- ; GCN-NEXT: renamable $sgpr1 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.0, addrspace 5)
+ ; GCN-NEXT: renamable $sgpr1 = S_LOAD_DWORD_SGPR renamable $sgpr2_sgpr3, renamable $sgpr0, 0 :: (dereferenceable invariant load (s32), addrspace 4)
; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr1
- ; GCN-NEXT: renamable $sgpr1 = SI_SPILL_S32_RESTORE %stack.2, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.2, addrspace 5)
+ ; GCN-NEXT: renamable $sgpr1 = S_LOAD_DWORD_SGPR renamable $sgpr2_sgpr3, renamable $sgpr0, 0 :: (dereferenceable invariant load (s32), addrspace 4)
; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr1
; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $sgpr2_sgpr3, implicit killed renamable $sgpr0
%0:sreg_64_xexec = COPY $sgpr8_sgpr9
@@ -366,16 +326,10 @@ body: |
; GCN-NEXT: renamable $sgpr2_sgpr3 = COPY $sgpr8_sgpr9
; GCN-NEXT: renamable $sgpr0 = COPY $sgpr10
; GCN-NEXT: renamable $sgpr1 = S_LOAD_DWORD_SGPR_IMM renamable $sgpr2_sgpr3, renamable $sgpr0, 0, 0 :: (dereferenceable invariant load (s32), addrspace 4)
- ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr1, %stack.1, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.1, addrspace 5)
- ; GCN-NEXT: renamable $sgpr1 = S_LOAD_DWORD_SGPR_IMM renamable $sgpr2_sgpr3, renamable $sgpr0, 4, 0 :: (dereferenceable invariant load (s32), addrspace 4)
- ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr1, %stack.0, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.0, addrspace 5)
- ; GCN-NEXT: renamable $sgpr1 = S_LOAD_DWORD_SGPR_IMM renamable $sgpr2_sgpr3, renamable $sgpr0, 8, 0 :: (dereferenceable invariant load (s32), addrspace 4)
- ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr1, %stack.2, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.2, addrspace 5)
- ; GCN-NEXT: renamable $sgpr1 = SI_SPILL_S32_RESTORE %stack.1, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.1, addrspace 5)
; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr1
- ; GCN-NEXT: renamable $sgpr1 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.0, addrspace 5)
+ ; GCN-NEXT: renamable $sgpr1 = S_LOAD_DWORD_SGPR_IMM renamable $sgpr2_sgpr3, renamable $sgpr0, 4, 0 :: (dereferenceable invariant load (s32), addrspace 4)
; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr1
- ; GCN-NEXT: renamable $sgpr1 = SI_SPILL_S32_RESTORE %stack.2, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.2, addrspace 5)
+ ; GCN-NEXT: renamable $sgpr1 = S_LOAD_DWORD_SGPR_IMM renamable $sgpr2_sgpr3, renamable $sgpr0, 8, 0 :: (dereferenceable invariant load (s32), addrspace 4)
; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr1
; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $sgpr2_sgpr3, implicit killed renamable $sgpr0
%0:sreg_64_xexec = COPY $sgpr8_sgpr9
@@ -430,15 +384,11 @@ body: |
; GCN: liveins: $sgpr8_sgpr9_sgpr10_sgpr11
; GCN-NEXT: {{ $}}
; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7 = COPY $sgpr8_sgpr9_sgpr10_sgpr11
- ; GCN-NEXT: renamable $sgpr0 = S_BUFFER_LOAD_DWORD_IMM renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0 :: (dereferenceable invariant load (s32), addrspace 4)
- ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr0, %stack.0, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.0, addrspace 5)
; GCN-NEXT: renamable $sgpr1 = S_BUFFER_LOAD_DWORD_IMM renamable $sgpr4_sgpr5_sgpr6_sgpr7, 4, 0 :: (dereferenceable invariant load (s32), addrspace 4)
- ; GCN-NEXT: renamable $sgpr0 = S_BUFFER_LOAD_DWORD_IMM renamable $sgpr4_sgpr5_sgpr6_sgpr7, 8, 0 :: (dereferenceable invariant load (s32), addrspace 4)
- ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr0, %stack.1, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.1, addrspace 5)
- ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.0, addrspace 5)
+ ; GCN-NEXT: renamable $sgpr0 = S_BUFFER_LOAD_DWORD_IMM renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0 :: (dereferenceable invariant load (s32), addrspace 4)
; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0
; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr1
- ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.1, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.1, addrspace 5)
+ ; GCN-NEXT: renamable $sgpr0 = S_BUFFER_LOAD_DWORD_IMM renamable $sgpr4_sgpr5_sgpr6_sgpr7, 8, 0 :: (dereferenceable invariant load (s32), addrspace 4)
; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0
; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7
%0:sgpr_128 = COPY $sgpr8_sgpr9_sgpr10_sgpr11
@@ -461,15 +411,11 @@ body: |
; GCN: liveins: $sgpr8_sgpr9
; GCN-NEXT: {{ $}}
; GCN-NEXT: renamable $sgpr2_sgpr3 = COPY $sgpr8_sgpr9
- ; GCN-NEXT: renamable $sgpr0 = S_SCRATCH_LOAD_DWORD_IMM renamable $sgpr2_sgpr3, 0, 0, implicit $flat_scr :: (dereferenceable invariant load (s32), addrspace 4)
- ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr0, %stack.0, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.0, addrspace 5)
; GCN-NEXT: renamable $sgpr1 = S_SCRATCH_LOAD_DWORD_IMM renamable $sgpr2_sgpr3, 4, 0, implicit $flat_scr :: (dereferenceable invariant load (s32), addrspace 4)
- ; GCN-NEXT: renamable $sgpr0 = S_SCRATCH_LOAD_DWORD_IMM renamable $sgpr2_sgpr3, 8, 0, implicit $flat_scr :: (dereferenceable invariant load (s32), addrspace 4)
- ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr0, %stack.1, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.1, addrspace 5)
- ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.0, addrspace 5)
+ ; GCN-NEXT: renamable $sgpr0 = S_SCRATCH_LOAD_DWORD_IMM renamable $sgpr2_sgpr3, 0, 0, implicit $flat_scr :: (dereferenceable invariant load (s32), addrspace 4)
; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0
; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr1
- ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.1, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.1, addrspace 5)
+ ; GCN-NEXT: renamable $sgpr0 = S_SCRATCH_LOAD_DWORD_IMM renamable $sgpr2_sgpr3, 8, 0, implicit $flat_scr :: (dereferenceable invariant load (s32), addrspace 4)
; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0
; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $sgpr2_sgpr3
%0:sreg_64_xexec = COPY $sgpr8_sgpr9
@@ -492,15 +438,11 @@ body: |
; GCN: liveins: $sgpr8_sgpr9
; GCN-NEXT: {{ $}}
; GCN-NEXT: renamable $sgpr2_sgpr3 = COPY $sgpr8_sgpr9
- ; GCN-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM renamable $sgpr2_sgpr3, 0, 0 :: (invariant load (s32), addrspace 4)
- ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr0, %stack.0, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.0, addrspace 5)
; GCN-NEXT: renamable $sgpr1 = S_LOAD_DWORD_IMM renamable $sgpr2_sgpr3, 4, 0 :: (invariant load (s32), addrspace 4)
- ; GCN-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM renamable $sgpr2_sgpr3, 8, 0 :: (invariant load (s32), addrspace 4)
- ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr0, %stack.1, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.1, addrspace 5)
- ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.0, addrspace 5)
+ ; GCN-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM renamable $sgpr2_sgpr3, 0, 0 :: (invariant load (s32), addrspace 4)
; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0
; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr1
- ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.1, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.1, addrspace 5)
+ ; GCN-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM renamable $sgpr2_sgpr3, 8, 0 :: (invariant load (s32), addrspace 4)
; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0
; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $sgpr2_sgpr3
%0:sreg_64_xexec = COPY $sgpr8_sgpr9
diff --git a/llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir b/llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir
index 55639a27fd5c746..355829825146dfe 100644
--- a/llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir
+++ b/llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir
@@ -33,7 +33,6 @@ body: |
; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: renamable $vgpr1 = IMPLICIT_DEF
; CHECK-NEXT: renamable $sgpr34_sgpr35 = IMPLICIT_DEF
; CHECK-NEXT: dead renamable $vgpr0 = IMPLICIT_DEF
; CHECK-NEXT: renamable $sgpr41 = IMPLICIT_DEF
@@ -41,16 +40,6 @@ body: |
; CHECK-NEXT: renamable $sgpr36_sgpr37 = IMPLICIT_DEF
; CHECK-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX8_IMM renamable $sgpr38_sgpr39, 0, 0 :: (dereferenceable invariant load (s256), align 16, addrspace 4)
; CHECK-NEXT: dead renamable $sgpr4 = S_LOAD_DWORD_IMM renamable $sgpr38_sgpr39, 48, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4)
- ; CHECK-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = S_LOAD_DWORDX8_IMM renamable $sgpr38_sgpr39, 56, 0 :: (dereferenceable invariant load (s256), align 8, addrspace 4)
- ; CHECK-NEXT: renamable $vgpr1 = V_WRITELANE_B32 $sgpr4, 0, killed $vgpr1, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11
- ; CHECK-NEXT: renamable $vgpr1 = V_WRITELANE_B32 $sgpr5, 1, killed $vgpr1
- ; CHECK-NEXT: renamable $vgpr1 = V_WRITELANE_B32 $sgpr6, 2, killed $vgpr1
- ; CHECK-NEXT: renamable $vgpr1 = V_WRITELANE_B32 $sgpr7, 3, killed $vgpr1
- ; CHECK-NEXT: renamable $vgpr1 = V_WRITELANE_B32 $sgpr8, 4, killed $vgpr1
- ; CHECK-NEXT: renamable $vgpr1 = V_WRITELANE_B32 $sgpr9, 5, killed $vgpr1
- ; CHECK-NEXT: renamable $vgpr1 = V_WRITELANE_B32 $sgpr10, 6, killed $vgpr1
- ; CHECK-NEXT: renamable $vgpr1 = V_WRITELANE_B32 killed $sgpr11, 7, killed $vgpr1, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11
- ; CHECK-NEXT: SI_SPILL_WWM_V32_SAVE killed $vgpr1, %stack.1, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5)
; CHECK-NEXT: dead renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM renamable $sgpr44_sgpr45, 0, 0 :: (invariant load (s64), align 16, addrspace 4)
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
; CHECK-NEXT: $vgpr1 = COPY renamable $sgpr51
@@ -63,50 +52,30 @@ body: |
; CHECK-NEXT: successors: %bb.3(0x80000000)
; CHECK-NEXT: liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FC00
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: renamable $vgpr1 = SI_SPILL_WWM_V32_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5)
- ; CHECK-NEXT: $sgpr4 = V_READLANE_B32 $vgpr1, 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11
- ; CHECK-NEXT: $sgpr5 = V_READLANE_B32 $vgpr1, 1
- ; CHECK-NEXT: $sgpr6 = V_READLANE_B32 $vgpr1, 2
- ; CHECK-NEXT: $sgpr7 = V_READLANE_B32 $vgpr1, 3
- ; CHECK-NEXT: $sgpr8 = V_READLANE_B32 $vgpr1, 4
- ; CHECK-NEXT: $sgpr9 = V_READLANE_B32 $vgpr1, 5
- ; CHECK-NEXT: $sgpr10 = V_READLANE_B32 $vgpr1, 6
- ; CHECK-NEXT: $sgpr11 = V_READLANE_B32 $vgpr1, 7
- ; CHECK-NEXT: $noreg = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
- ; CHECK-NEXT: $exec = S_MOV_B64 killed $noreg
+ ; CHECK-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = S_LOAD_DWORDX8_IMM renamable $sgpr38_sgpr39, 56, 0 :: (dereferenceable invariant load (s256), align 8, addrspace 4)
; CHECK-NEXT: S_BRANCH %bb.3
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
; CHECK-NEXT: successors: %bb.3(0x80000000)
; CHECK-NEXT: liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FC00
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: renamable $vgpr1 = SI_SPILL_WWM_V32_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5)
- ; CHECK-NEXT: $sgpr4 = V_READLANE_B32 $vgpr1, 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11
- ; CHECK-NEXT: $sgpr5 = V_READLANE_B32 $vgpr1, 1
- ; CHECK-NEXT: $sgpr6 = V_READLANE_B32 $vgpr1, 2
- ; CHECK-NEXT: $sgpr7 = V_READLANE_B32 $vgpr1, 3
- ; CHECK-NEXT: $sgpr8 = V_READLANE_B32 $vgpr1, 4
- ; CHECK-NEXT: $sgpr9 = V_READLANE_B32 $vgpr1, 5
- ; CHECK-NEXT: $sgpr10 = V_READLANE_B32 $vgpr1, 6
- ; CHECK-NEXT: $sgpr11 = V_READLANE_B32 $vgpr1, 7
+ ; CHECK-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = S_LOAD_DWORDX8_IMM renamable $sgpr38_sgpr39, 56, 0 :: (dereferenceable invariant load (s256), align 8, addrspace 4)
; CHECK-NEXT: S_CMP_LG_U64 renamable $sgpr4_sgpr5, 0, implicit-def $scc
- ; CHECK-NEXT: $noreg = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
- ; CHECK-NEXT: $exec = S_MOV_B64 killed $noreg
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.3:
; CHECK-NEXT: successors: %bb.5(0x40000000), %bb.4(0x40000000)
- ; CHECK-NEXT: liveins: $vgpr1, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11:0x00000000000003F0, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FC00
+ ; CHECK-NEXT: liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11:0x00000000000003F0, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FC00
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: S_CBRANCH_VCCZ %bb.5, implicit undef $vcc
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.4:
; CHECK-NEXT: successors: %bb.5(0x80000000)
- ; CHECK-NEXT: liveins: $vgpr1, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11:0x00000000000003F0, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FC00
+ ; CHECK-NEXT: liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11:0x00000000000003F0, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FC00
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: S_CMP_EQ_U32 renamable $sgpr8, 0, implicit-def $scc
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.5:
- ; CHECK-NEXT: liveins: $vgpr1, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11:0x00000000000000F0, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FC00
+ ; CHECK-NEXT: liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11:0x00000000000000F0, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FC00
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: dead renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr38_sgpr39, 40, 0 :: (dereferenceable invariant load (s64), addrspace 4)
; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR undef renamable $vgpr0, undef renamable $vgpr0, killed renamable $sgpr6_sgpr7, 0, 0, implicit $exec :: (store (s32), addrspace 1)
@@ -116,7 +85,6 @@ body: |
; CHECK-NEXT: $sgpr6_sgpr7 = COPY killed renamable $sgpr36_sgpr37
; CHECK-NEXT: $sgpr10_sgpr11 = COPY killed renamable $sgpr34_sgpr35
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
- ; CHECK-NEXT: KILL killed renamable $vgpr1
; CHECK-NEXT: S_ENDPGM 0
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr14, $sgpr15, $sgpr16
>From b684eb7e72157a24660cce36b206af0ac4659869 Mon Sep 17 00:00:00 2001
From: Piotr Sobczak <piotr.sobczak at amd.com>
Date: Fri, 13 Oct 2023 12:09:53 +0200
Subject: [PATCH 2/5] Fixup - addressing review comments
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 20 +++++++++-----------
1 file changed, 9 insertions(+), 11 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 31a086cd2a61820..93380ae56be1220 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2488,17 +2488,16 @@ void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB,
}
if (SingleUseIdx == -1)
break;
- MachineOperand *UseMO = &I->getOperand(SingleUseIdx);
- if (UseMO->getSubReg() == AMDGPU::NoSubRegister)
+ MachineOperand &UseMO = I->getOperand(SingleUseIdx);
+ if (UseMO.getSubReg() == AMDGPU::NoSubRegister)
break;
- unsigned Offset = RI.getSubRegIdxOffset(UseMO->getSubReg());
- unsigned SubregSize = RI.getSubRegIdxSize(UseMO->getSubReg());
+ unsigned Offset = RI.getSubRegIdxOffset(UseMO.getSubReg());
+ unsigned SubregSize = RI.getSubRegIdxSize(UseMO.getSubReg());
MachineFunction *MF = MBB.getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
- assert(MRI.hasAtMostUserInstrs(DestReg, 0) &&
- "DestReg should have no users yet.");
+ assert(MRI.use_nodbg_empty(DestReg) && "DestReg should have no users yet.");
unsigned NewOpcode = -1;
if (SubregSize == 256)
@@ -2513,8 +2512,8 @@ void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB,
RI.getAllocatableClass(getRegClass(TID, 0, &RI, *MF));
MRI.setRegClass(DestReg, NewRC);
- UseMO->setReg(DestReg);
- UseMO->setSubReg(AMDGPU::NoSubRegister);
+ UseMO.setReg(DestReg);
+ UseMO.setSubReg(AMDGPU::NoSubRegister);
// Use a smaller load with the desired size, possibly with updated offset.
MachineInstr *MI = MF->CloneMachineInstr(&Orig);
@@ -2539,9 +2538,8 @@ void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB,
default:
break;
}
- MachineInstr *MI = MBB.getParent()->CloneMachineInstr(&Orig);
- MI->substituteRegister(MI->getOperand(0).getReg(), DestReg, SubIdx, RI);
- MBB.insert(I, MI);
+
+ TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig, RI);
}
std::pair<MachineInstr*, MachineInstr*>
>From 3b3f987ad234aa6da9f8b3b03e0ddfdd78566278 Mon Sep 17 00:00:00 2001
From: Piotr Sobczak <piotr.sobczak at amd.com>
Date: Mon, 16 Oct 2023 16:33:51 +0200
Subject: [PATCH 3/5] Fixup - addressing review comments
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 93380ae56be1220..d91dbf578a7fef4 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2477,7 +2477,7 @@ void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB,
if (!CandReg)
continue;
- if (CandReg == RegToFind || RI.regsOverlap(CandReg, RegToFind)) {
+ if (CandReg == RegToFind) {
if (SingleUseIdx == -1 && CandMO.isUse()) {
SingleUseIdx = i;
} else {
>From 4f66c207d004c1e3483922c19f71fdd812996541 Mon Sep 17 00:00:00 2001
From: Piotr Sobczak <piotr.sobczak at amd.com>
Date: Mon, 16 Oct 2023 17:36:09 +0200
Subject: [PATCH 4/5] Fixup - addressing review comments
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 35 +++++++++-----------------
1 file changed, 12 insertions(+), 23 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index d91dbf578a7fef4..a742eccb15e2bd2 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2468,32 +2468,21 @@ void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB,
// Look for a single use of the register that is also a subreg.
Register RegToFind = Orig.getOperand(0).getReg();
- int SingleUseIdx = -1;
- for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
- const MachineOperand &CandMO = I->getOperand(i);
- if (!CandMO.isReg())
+ MachineOperand *UseMO = nullptr;
+ for (auto &CandMO : I->operands()) {
+ if (!CandMO.isReg() || CandMO.getReg() != RegToFind || CandMO.isDef())
continue;
- Register CandReg = CandMO.getReg();
- if (!CandReg)
- continue;
-
- if (CandReg == RegToFind) {
- if (SingleUseIdx == -1 && CandMO.isUse()) {
- SingleUseIdx = i;
- } else {
- SingleUseIdx = -1;
- break;
- }
+ if (UseMO) {
+ UseMO = nullptr;
+ break;
}
+ UseMO = &CandMO;
}
- if (SingleUseIdx == -1)
- break;
- MachineOperand &UseMO = I->getOperand(SingleUseIdx);
- if (UseMO.getSubReg() == AMDGPU::NoSubRegister)
+ if (!UseMO || UseMO->getSubReg() == AMDGPU::NoSubRegister)
break;
- unsigned Offset = RI.getSubRegIdxOffset(UseMO.getSubReg());
- unsigned SubregSize = RI.getSubRegIdxSize(UseMO.getSubReg());
+ unsigned Offset = RI.getSubRegIdxOffset(UseMO->getSubReg());
+ unsigned SubregSize = RI.getSubRegIdxSize(UseMO->getSubReg());
MachineFunction *MF = MBB.getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
@@ -2512,8 +2501,8 @@ void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB,
RI.getAllocatableClass(getRegClass(TID, 0, &RI, *MF));
MRI.setRegClass(DestReg, NewRC);
- UseMO.setReg(DestReg);
- UseMO.setSubReg(AMDGPU::NoSubRegister);
+ UseMO->setReg(DestReg);
+ UseMO->setSubReg(AMDGPU::NoSubRegister);
// Use a smaller load with the desired size, possibly with updated offset.
MachineInstr *MI = MF->CloneMachineInstr(&Orig);
>From 51415cd507c2f08cbd9e03bb35e8af858f0bd3e0 Mon Sep 17 00:00:00 2001
From: Piotr Sobczak <piotr.sobczak at amd.com>
Date: Wed, 25 Oct 2023 12:32:57 +0200
Subject: [PATCH 5/5] Fixup - addressing review comments
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 29 ++++++++++++++++----------
1 file changed, 18 insertions(+), 11 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index a742eccb15e2bd2..54fde2bd8f57b8f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -106,20 +106,27 @@ static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
}
-bool SIInstrInfo::isReallyTriviallyReMaterializable(
- const MachineInstr &MI) const {
+static bool canRemat(const MachineInstr &MI) {
- bool CanRemat = false;
- if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isSDWA(MI) || isSALU(MI)) {
- CanRemat = true;
- } else if (isSMRD(MI)) {
- CanRemat = !MI.memoperands_empty() &&
- llvm::all_of(MI.memoperands(), [](const MachineMemOperand *MMO) {
- return MMO->isLoad() && MMO->isInvariant();
- });
+ if (SIInstrInfo::isVOP1(MI) || SIInstrInfo::isVOP2(MI) ||
+ SIInstrInfo::isVOP3(MI) || SIInstrInfo::isSDWA(MI) ||
+ SIInstrInfo::isSALU(MI))
+ return true;
+
+ if (SIInstrInfo::isSMRD(MI)) {
+ return !MI.memoperands_empty() &&
+ llvm::all_of(MI.memoperands(), [](const MachineMemOperand *MMO) {
+ return MMO->isLoad() && MMO->isInvariant();
+ });
}
- if (CanRemat) {
+ return false;
+}
+
+bool SIInstrInfo::isReallyTriviallyReMaterializable(
+ const MachineInstr &MI) const {
+
+ if (canRemat(MI)) {
// Normally VALU use of exec would block the rematerialization, but that
// is OK in this case to have an implicit exec read as all VALU do.
// We really want all of the generic logic for this except for this.
More information about the cfe-commits
mailing list