[llvm] 5789993 - [AMDGPU] Make GCNRegBankReassign assign based on subreg banks
Carl Ritson via llvm-commits
llvm-commits at lists.llvm.org
Mon Aug 3 20:55:46 PDT 2020
Author: Carl Ritson
Date: 2020-08-04T12:54:44+09:00
New Revision: 57899934eab18bbcab3482cc3ef862b0a1617ad0
URL: https://github.com/llvm/llvm-project/commit/57899934eab18bbcab3482cc3ef862b0a1617ad0
DIFF: https://github.com/llvm/llvm-project/commit/57899934eab18bbcab3482cc3ef862b0a1617ad0.diff
LOG: [AMDGPU] Make GCNRegBankReassign assign based on subreg banks
When scavenging, consider the sub-register of the source operand
to determine the bank of a candidate register (not just sub0).
Without this it is possible to introduce an infinite loop,
e.g. $sgpr15_sgpr16_sgpr17 can be assigned for a conflict between
$sgpr0 and SGPR_96:sub1.
Reviewed By: rampitec
Differential Revision: https://reviews.llvm.org/D84910
Added:
llvm/test/CodeGen/AMDGPU/regbank-reassign-wave64.mir
Modified:
llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp
llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
llvm/test/CodeGen/AMDGPU/regbank-reassign.mir
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp b/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp
index 98d971630ca4..79b33e24c830 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp
@@ -83,9 +83,10 @@ class GCNRegBankReassign : public MachineFunctionPass {
class Candidate {
public:
- Candidate(MachineInstr *mi, unsigned reg, unsigned freebanks,
- unsigned weight)
- : MI(mi), Reg(reg), FreeBanks(freebanks), Weight(weight) {}
+ Candidate(MachineInstr *mi, unsigned reg, unsigned subreg,
+ unsigned freebanks, unsigned weight)
+ : MI(mi), Reg(reg), SubReg(subreg), FreeBanks(freebanks),
+ Weight(weight) {}
bool operator< (const Candidate& RHS) const { return Weight < RHS.Weight; }
@@ -100,6 +101,7 @@ class GCNRegBankReassign : public MachineFunctionPass {
MachineInstr *MI;
unsigned Reg;
+ unsigned SubReg;
unsigned FreeBanks;
unsigned Weight;
};
@@ -162,7 +164,7 @@ class GCNRegBankReassign : public MachineFunctionPass {
const MCPhysReg *CSRegs;
// Returns bank for a phys reg.
- unsigned getPhysRegBank(unsigned Reg) const;
+ unsigned getPhysRegBank(unsigned Reg, unsigned SubReg) const;
// Return a bit set for each register bank used. 4 banks for VGPRs and
// 8 banks for SGPRs.
@@ -176,7 +178,7 @@ class GCNRegBankReassign : public MachineFunctionPass {
// a register chosen from Bank.
std::pair<unsigned, unsigned> analyzeInst(const MachineInstr &MI,
unsigned Reg = AMDGPU::NoRegister,
- int Bank = -1);
+ unsigned SubReg = 0, int Bank = -1);
// Return true if register is regular VGPR or SGPR or their tuples.
// Returns false for special registers like m0, vcc etc.
@@ -216,11 +218,12 @@ class GCNRegBankReassign : public MachineFunctionPass {
// candidates are collected and added to work list.
unsigned computeStallCycles(unsigned SrcReg,
unsigned Reg = AMDGPU::NoRegister,
- int Bank = -1, bool Collect = false);
+ unsigned SubReg = 0, int Bank = -1,
+ bool Collect = false);
// Search for a register in Bank unused within LI.
// Returns phys reg or NoRegister.
- unsigned scavengeReg(LiveInterval& LI, unsigned Bank) const;
+ unsigned scavengeReg(LiveInterval &LI, unsigned Bank, unsigned SubReg) const;
// Try to reassign candidate. Returns number or stall cycles saved.
unsigned tryReassign(Candidate &C);
@@ -277,15 +280,24 @@ char GCNRegBankReassign::ID = 0;
char &llvm::GCNRegBankReassignID = GCNRegBankReassign::ID;
-unsigned GCNRegBankReassign::getPhysRegBank(unsigned Reg) const {
+unsigned GCNRegBankReassign::getPhysRegBank(unsigned Reg,
+ unsigned SubReg) const {
assert(Register::isPhysicalRegister(Reg));
const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
unsigned Size = TRI->getRegSizeInBits(*RC);
if (Size == 16)
Reg = TRI->get32BitRegister(Reg);
- else if (Size > 32)
- Reg = TRI->getSubReg(Reg, AMDGPU::sub0);
+ else if (Size > 32) {
+ if (SubReg) {
+ const TargetRegisterClass *SubRC = TRI->getSubRegClass(RC, SubReg);
+ Reg = TRI->getSubReg(Reg, SubReg);
+ if (TRI->getRegSizeInBits(*SubRC) > 32)
+ Reg = TRI->getSubReg(Reg, AMDGPU::sub0);
+ } else {
+ Reg = TRI->getSubReg(Reg, AMDGPU::sub0);
+ }
+ }
if (TRI->hasVGPRs(RC)) {
Reg -= AMDGPU::VGPR0;
@@ -360,7 +372,7 @@ uint32_t GCNRegBankReassign::getRegBankMask(unsigned Reg, unsigned SubReg,
std::pair<unsigned, unsigned>
GCNRegBankReassign::analyzeInst(const MachineInstr &MI, unsigned Reg,
- int Bank) {
+ unsigned SubReg, int Bank) {
unsigned StallCycles = 0;
unsigned UsedBanks = 0;
@@ -375,26 +387,39 @@ GCNRegBankReassign::analyzeInst(const MachineInstr &MI, unsigned Reg,
if (!Op.isReg() || Op.isUndef())
continue;
- Register R = Op.getReg();
- if (TRI->hasAGPRs(TRI->getRegClassForReg(*MRI, R)))
- continue;
+ const Register R = Op.getReg();
+ const TargetRegisterClass *RC = TRI->getRegClassForReg(*MRI, R);
- unsigned ShiftedBank = Bank;
+ // Do not compute stalls for AGPRs
+ if (TRI->hasAGPRs(RC))
+ continue;
- if (Bank != -1 && R == Reg && Op.getSubReg()) {
- unsigned Offset = TRI->getChannelFromSubReg(Op.getSubReg());
+ // Do not compute stalls if sub-register covers all banks
+ if (Op.getSubReg()) {
LaneBitmask LM = TRI->getSubRegIndexLaneMask(Op.getSubReg());
- if (Offset && Bank < NUM_VGPR_BANKS) {
- // If a register spans all banks we cannot shift it to avoid conflict.
+ if (TRI->hasVGPRs(RC)) {
if (TRI->getNumCoveredRegs(LM) >= NUM_VGPR_BANKS)
continue;
- ShiftedBank = (Bank + Offset) % NUM_VGPR_BANKS;
- } else if (Offset > 1 && Bank >= SGPR_BANK_OFFSET) {
- // If a register spans all banks we cannot shift it to avoid conflict.
+ } else {
if (TRI->getNumCoveredRegs(LM) / 2 >= NUM_SGPR_BANKS)
continue;
+ }
+ }
+
+ unsigned ShiftedBank = Bank;
+
+ if (Bank != -1 && R == Reg && (Op.getSubReg() || SubReg)) {
+ unsigned RegOffset =
+ TRI->getChannelFromSubReg(SubReg ? SubReg : (unsigned)AMDGPU::sub0);
+ unsigned Offset = TRI->getChannelFromSubReg(
+ Op.getSubReg() ? Op.getSubReg() : (unsigned)AMDGPU::sub0);
+ if (Bank < NUM_VGPR_BANKS) {
+ unsigned Shift = ((NUM_VGPR_BANKS + Offset) - RegOffset);
+ ShiftedBank = (Bank + Shift) % NUM_VGPR_BANKS;
+ } else if (Bank >= SGPR_BANK_OFFSET) {
+ unsigned Shift = (NUM_SGPR_BANKS + (Offset >> 1)) - (RegOffset >> 1);
ShiftedBank = SGPR_BANK_OFFSET +
- (Bank - SGPR_BANK_OFFSET + (Offset >> 1)) % NUM_SGPR_BANKS;
+ (Bank - SGPR_BANK_OFFSET + Shift) % NUM_SGPR_BANKS;
}
}
@@ -576,17 +601,17 @@ void GCNRegBankReassign::collectCandidates(MachineInstr& MI,
unsigned FreeBanks1 = getFreeBanks(Reg1, SubReg1, Mask1, UsedBanks);
unsigned FreeBanks2 = getFreeBanks(Reg2, SubReg2, Mask2, UsedBanks);
if (FreeBanks1)
- Candidates.push(Candidate(&MI, Reg1, FreeBanks1, Weight
- + ((Size2 > Size1) ? 1 : 0)));
+ Candidates.push(Candidate(&MI, Reg1, SubReg1, FreeBanks1,
+ Weight + ((Size2 > Size1) ? 1 : 0)));
if (FreeBanks2)
- Candidates.push(Candidate(&MI, Reg2, FreeBanks2, Weight
- + ((Size1 > Size2) ? 1 : 0)));
+ Candidates.push(Candidate(&MI, Reg2, SubReg2, FreeBanks2,
+ Weight + ((Size1 > Size2) ? 1 : 0)));
}
}
}
-unsigned GCNRegBankReassign::computeStallCycles(unsigned SrcReg,
- unsigned Reg, int Bank,
+unsigned GCNRegBankReassign::computeStallCycles(unsigned SrcReg, unsigned Reg,
+ unsigned SubReg, int Bank,
bool Collect) {
unsigned TotalStallCycles = 0;
SmallSet<const MachineInstr *, 16> Visited;
@@ -598,7 +623,7 @@ unsigned GCNRegBankReassign::computeStallCycles(unsigned SrcReg,
continue;
unsigned StallCycles;
unsigned UsedBanks;
- std::tie(StallCycles, UsedBanks) = analyzeInst(MI, Reg, Bank);
+ std::tie(StallCycles, UsedBanks) = analyzeInst(MI, Reg, SubReg, Bank);
TotalStallCycles += StallCycles;
if (Collect)
collectCandidates(MI, UsedBanks, StallCycles);
@@ -607,8 +632,8 @@ unsigned GCNRegBankReassign::computeStallCycles(unsigned SrcReg,
return TotalStallCycles;
}
-unsigned GCNRegBankReassign::scavengeReg(LiveInterval& LI,
- unsigned Bank) const {
+unsigned GCNRegBankReassign::scavengeReg(LiveInterval &LI, unsigned Bank,
+ unsigned SubReg) const {
const TargetRegisterClass *RC = MRI->getRegClass(LI.reg);
unsigned MaxNumRegs = (Bank < NUM_VGPR_BANKS) ? MaxNumVGPRs
: MaxNumSGPRs;
@@ -620,7 +645,7 @@ unsigned GCNRegBankReassign::scavengeReg(LiveInterval& LI,
if (TRI->isSubRegisterEq(Reg, MaxReg))
break;
- if (!MRI->isAllocatable(Reg) || getPhysRegBank(Reg) != Bank)
+ if (!MRI->isAllocatable(Reg) || getPhysRegBank(Reg, SubReg) != Bank)
continue;
for (unsigned I = 0; CSRegs[I]; ++I)
@@ -669,7 +694,7 @@ unsigned GCNRegBankReassign::tryReassign(Candidate &C) {
for (int Bank = 0; Bank < NUM_BANKS; ++Bank) {
if (C.FreeBanks & (1 << Bank)) {
LLVM_DEBUG(dbgs() << "Trying bank " << printBank(Bank) << '\n');
- unsigned Stalls = computeStallCycles(C.Reg, C.Reg, Bank);
+ unsigned Stalls = computeStallCycles(C.Reg, C.Reg, C.SubReg, Bank);
if (Stalls < OrigStalls) {
LLVM_DEBUG(dbgs() << "With bank " << printBank(Bank) << " -> "
<< Stalls << '\n');
@@ -683,7 +708,7 @@ unsigned GCNRegBankReassign::tryReassign(Candidate &C) {
LRM->unassign(LI);
while (!BankStalls.empty()) {
BankStall BS = BankStalls.pop_back_val();
- unsigned Reg = scavengeReg(LI, BS.Bank);
+ unsigned Reg = scavengeReg(LI, BS.Bank, C.SubReg);
if (Reg == AMDGPU::NoRegister) {
LLVM_DEBUG(dbgs() << "No free registers in bank " << printBank(BS.Bank)
<< '\n');
@@ -801,7 +826,7 @@ bool GCNRegBankReassign::runOnMachineFunction(MachineFunction &MF) {
Candidates.pop_back();
if (LocalCyclesSaved) {
removeCandidates(C.Reg);
- computeStallCycles(C.Reg, AMDGPU::NoRegister, -1, true);
+ computeStallCycles(C.Reg, AMDGPU::NoRegister, 0, -1, true);
Candidates.sort();
LLVM_DEBUG(dbgs() << "\nCandidates:\n\n";
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
index 8e4a071701b3..b4afc48f98ba 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
@@ -1492,7 +1492,7 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_v_v(<8 x double> %vec, double %
; MOVREL-NEXT: v_cmp_eq_u32_e64 s4, 5, v18
; MOVREL-NEXT: v_mov_b32_e32 v19, v0
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v18
-; MOVREL-NEXT: v_mov_b32_e32 v20, v1
+; MOVREL-NEXT: v_mov_b32_e32 v23, v1
; MOVREL-NEXT: v_cmp_eq_u32_e64 s1, 2, v18
; MOVREL-NEXT: v_cmp_eq_u32_e64 s2, 3, v18
; MOVREL-NEXT: v_cmp_eq_u32_e64 s5, 7, v18
@@ -1501,7 +1501,7 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_v_v(<8 x double> %vec, double %
; MOVREL-NEXT: v_cndmask_b32_e64 v2, v2, v16, s0
; MOVREL-NEXT: v_cndmask_b32_e64 v8, v8, v16, s3
; MOVREL-NEXT: v_cndmask_b32_e64 v10, v10, v16, s4
-; MOVREL-NEXT: v_cndmask_b32_e32 v1, v20, v17, vcc_lo
+; MOVREL-NEXT: v_cndmask_b32_e32 v1, v23, v17, vcc_lo
; MOVREL-NEXT: v_cndmask_b32_e64 v3, v3, v17, s0
; MOVREL-NEXT: v_cndmask_b32_e64 v9, v9, v17, s3
; MOVREL-NEXT: v_cndmask_b32_e64 v11, v11, v17, s4
@@ -2123,7 +2123,7 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_v_v_add_1(<8 x double> %vec, do
; MOVREL: ; %bb.0: ; %entry
; MOVREL-NEXT: v_add_nc_u32_e32 v18, 1, v18
; MOVREL-NEXT: v_mov_b32_e32 v19, v0
-; MOVREL-NEXT: v_mov_b32_e32 v20, v1
+; MOVREL-NEXT: v_mov_b32_e32 v23, v1
; MOVREL-NEXT: ; implicit-def: $vcc_hi
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v18
; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, 1, v18
@@ -2137,7 +2137,7 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_v_v_add_1(<8 x double> %vec, do
; MOVREL-NEXT: v_cndmask_b32_e64 v2, v2, v16, s0
; MOVREL-NEXT: v_cndmask_b32_e64 v8, v8, v16, s3
; MOVREL-NEXT: v_cndmask_b32_e64 v10, v10, v16, s4
-; MOVREL-NEXT: v_cndmask_b32_e32 v1, v20, v17, vcc_lo
+; MOVREL-NEXT: v_cndmask_b32_e32 v1, v23, v17, vcc_lo
; MOVREL-NEXT: v_cndmask_b32_e64 v3, v3, v17, s0
; MOVREL-NEXT: v_cndmask_b32_e64 v9, v9, v17, s3
; MOVREL-NEXT: v_cndmask_b32_e64 v11, v11, v17, s4
@@ -4111,7 +4111,7 @@ define amdgpu_ps <7 x double> @dyn_insertelement_v7f64_v_v_v(<7 x double> %vec,
; MOVREL-NEXT: v_cmp_eq_u32_e64 s3, 4, v16
; MOVREL-NEXT: v_cmp_eq_u32_e64 s4, 5, v16
; MOVREL-NEXT: v_cmp_eq_u32_e64 s5, 6, v16
-; MOVREL-NEXT: v_mov_b32_e32 v17, v2
+; MOVREL-NEXT: v_mov_b32_e32 v19, v2
; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, 1, v16
; MOVREL-NEXT: v_mov_b32_e32 v18, v3
; MOVREL-NEXT: v_cndmask_b32_e64 v6, v6, v14, s2
@@ -4119,7 +4119,7 @@ define amdgpu_ps <7 x double> @dyn_insertelement_v7f64_v_v_v(<7 x double> %vec,
; MOVREL-NEXT: v_cndmask_b32_e64 v10, v10, v14, s4
; MOVREL-NEXT: v_cndmask_b32_e64 v12, v12, v14, s5
; MOVREL-NEXT: v_cndmask_b32_e64 v7, v7, v15, s2
-; MOVREL-NEXT: v_cndmask_b32_e64 v2, v17, v14, s0
+; MOVREL-NEXT: v_cndmask_b32_e64 v2, v19, v14, s0
; MOVREL-NEXT: v_cndmask_b32_e64 v9, v9, v15, s3
; MOVREL-NEXT: v_cndmask_b32_e64 v3, v18, v15, s0
; MOVREL-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc_lo
@@ -4251,42 +4251,42 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_s_v_s(<5 x double> inreg
; MOVREL-NEXT: s_mov_b32 s7, s9
; MOVREL-NEXT: s_mov_b32 s8, s10
; MOVREL-NEXT: s_mov_b32 s9, s11
-; MOVREL-NEXT: v_mov_b32_e32 v18, s15
-; MOVREL-NEXT: v_mov_b32_e32 v17, s14
-; MOVREL-NEXT: v_mov_b32_e32 v16, s13
-; MOVREL-NEXT: v_mov_b32_e32 v15, s12
-; MOVREL-NEXT: v_mov_b32_e32 v14, s11
-; MOVREL-NEXT: v_mov_b32_e32 v13, s10
-; MOVREL-NEXT: v_mov_b32_e32 v12, s9
-; MOVREL-NEXT: v_mov_b32_e32 v11, s8
-; MOVREL-NEXT: v_mov_b32_e32 v10, s7
-; MOVREL-NEXT: v_mov_b32_e32 v9, s6
-; MOVREL-NEXT: v_mov_b32_e32 v8, s5
-; MOVREL-NEXT: v_mov_b32_e32 v7, s4
-; MOVREL-NEXT: v_mov_b32_e32 v6, s3
-; MOVREL-NEXT: v_mov_b32_e32 v5, s2
-; MOVREL-NEXT: v_mov_b32_e32 v4, s1
-; MOVREL-NEXT: v_mov_b32_e32 v3, s0
+; MOVREL-NEXT: v_mov_b32_e32 v20, s15
+; MOVREL-NEXT: v_mov_b32_e32 v19, s14
+; MOVREL-NEXT: v_mov_b32_e32 v18, s13
+; MOVREL-NEXT: v_mov_b32_e32 v17, s12
+; MOVREL-NEXT: v_mov_b32_e32 v16, s11
+; MOVREL-NEXT: v_mov_b32_e32 v15, s10
+; MOVREL-NEXT: v_mov_b32_e32 v14, s9
+; MOVREL-NEXT: v_mov_b32_e32 v13, s8
+; MOVREL-NEXT: v_mov_b32_e32 v12, s7
+; MOVREL-NEXT: v_mov_b32_e32 v11, s6
+; MOVREL-NEXT: v_mov_b32_e32 v10, s5
+; MOVREL-NEXT: v_mov_b32_e32 v9, s4
+; MOVREL-NEXT: v_mov_b32_e32 v8, s3
+; MOVREL-NEXT: v_mov_b32_e32 v7, s2
+; MOVREL-NEXT: v_mov_b32_e32 v6, s1
+; MOVREL-NEXT: v_mov_b32_e32 v5, s0
; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s12, 0
; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, s12, 1
; MOVREL-NEXT: v_cmp_eq_u32_e64 s1, s12, 4
; MOVREL-NEXT: ; implicit-def: $vcc_hi
-; MOVREL-NEXT: v_cndmask_b32_e32 v2, v3, v0, vcc_lo
-; MOVREL-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc_lo
-; MOVREL-NEXT: v_cndmask_b32_e64 v4, v5, v0, s0
-; MOVREL-NEXT: v_cndmask_b32_e64 v5, v6, v1, s0
+; MOVREL-NEXT: v_cndmask_b32_e32 v2, v5, v0, vcc_lo
+; MOVREL-NEXT: v_cndmask_b32_e32 v3, v6, v1, vcc_lo
+; MOVREL-NEXT: v_cndmask_b32_e64 v4, v7, v0, s0
; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s12, 2
+; MOVREL-NEXT: v_cndmask_b32_e64 v5, v8, v1, s0
; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, s12, 3
; MOVREL-NEXT: v_readfirstlane_b32 s2, v4
-; MOVREL-NEXT: v_readfirstlane_b32 s3, v5
-; MOVREL-NEXT: v_cndmask_b32_e32 v6, v7, v0, vcc_lo
-; MOVREL-NEXT: v_cndmask_b32_e32 v7, v8, v1, vcc_lo
-; MOVREL-NEXT: v_cndmask_b32_e64 v8, v9, v0, s0
-; MOVREL-NEXT: v_cndmask_b32_e64 v9, v10, v1, s0
-; MOVREL-NEXT: v_cndmask_b32_e64 v0, v11, v0, s1
-; MOVREL-NEXT: v_cndmask_b32_e64 v1, v12, v1, s1
+; MOVREL-NEXT: v_cndmask_b32_e32 v6, v9, v0, vcc_lo
+; MOVREL-NEXT: v_cndmask_b32_e32 v7, v10, v1, vcc_lo
+; MOVREL-NEXT: v_cndmask_b32_e64 v8, v11, v0, s0
+; MOVREL-NEXT: v_cndmask_b32_e64 v9, v12, v1, s0
+; MOVREL-NEXT: v_cndmask_b32_e64 v0, v13, v0, s1
+; MOVREL-NEXT: v_cndmask_b32_e64 v1, v14, v1, s1
; MOVREL-NEXT: v_readfirstlane_b32 s0, v2
; MOVREL-NEXT: v_readfirstlane_b32 s1, v3
+; MOVREL-NEXT: v_readfirstlane_b32 s3, v5
; MOVREL-NEXT: v_readfirstlane_b32 s4, v6
; MOVREL-NEXT: v_readfirstlane_b32 s5, v7
; MOVREL-NEXT: v_readfirstlane_b32 s6, v8
@@ -4448,7 +4448,7 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_v_v_s(<5 x double> %vec,
; MOVREL-LABEL: dyn_insertelement_v5f64_v_v_s:
; MOVREL: ; %bb.0: ; %entry
; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 0
-; MOVREL-NEXT: v_mov_b32_e32 v13, v2
+; MOVREL-NEXT: v_mov_b32_e32 v15, v2
; MOVREL-NEXT: v_mov_b32_e32 v14, v3
; MOVREL-NEXT: ; implicit-def: $vcc_hi
; MOVREL-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo
@@ -4457,7 +4457,7 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_v_v_s(<5 x double> %vec,
; MOVREL-NEXT: v_readfirstlane_b32 s0, v0
; MOVREL-NEXT: v_readfirstlane_b32 s1, v1
; MOVREL-NEXT: v_cndmask_b32_e32 v3, v14, v11, vcc_lo
-; MOVREL-NEXT: v_cndmask_b32_e32 v2, v13, v10, vcc_lo
+; MOVREL-NEXT: v_cndmask_b32_e32 v2, v15, v10, vcc_lo
; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 2
; MOVREL-NEXT: v_readfirstlane_b32 s3, v3
; MOVREL-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc_lo
@@ -4514,7 +4514,7 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_v_v_v(<5 x double> %vec,
; MOVREL-LABEL: dyn_insertelement_v5f64_v_v_v:
; MOVREL: ; %bb.0: ; %entry
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v12
-; MOVREL-NEXT: v_mov_b32_e32 v13, v2
+; MOVREL-NEXT: v_mov_b32_e32 v15, v2
; MOVREL-NEXT: v_mov_b32_e32 v14, v3
; MOVREL-NEXT: ; implicit-def: $vcc_hi
; MOVREL-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo
@@ -4522,7 +4522,7 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_v_v_v(<5 x double> %vec,
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12
; MOVREL-NEXT: v_readfirstlane_b32 s0, v0
; MOVREL-NEXT: v_readfirstlane_b32 s1, v1
-; MOVREL-NEXT: v_cndmask_b32_e32 v2, v13, v10, vcc_lo
+; MOVREL-NEXT: v_cndmask_b32_e32 v2, v15, v10, vcc_lo
; MOVREL-NEXT: v_cndmask_b32_e32 v3, v14, v11, vcc_lo
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v12
; MOVREL-NEXT: v_readfirstlane_b32 s2, v2
diff --git a/llvm/test/CodeGen/AMDGPU/regbank-reassign-wave64.mir b/llvm/test/CodeGen/AMDGPU/regbank-reassign-wave64.mir
new file mode 100644
index 000000000000..49d6a9ad1971
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/regbank-reassign-wave64.mir
@@ -0,0 +1,69 @@
+# RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-WavefrontSize32,+WavefrontSize64 -verify-machineinstrs -run-pass greedy,amdgpu-regbanks-reassign,virtregrewriter -o - %s | FileCheck -check-prefix=GCN %s
+
+
+# Test that subreg reassignments are correctly handled when whole register also
+# conflicts. If this is mishandled stall counts will be incorrect and cause an
+# infinite loop.
+# GCN-LABEL: vgpr64_mixed_use{{$}}
+# GCN: $vgpr0_vgpr1 = IMPLICIT_DEF
+# GCN: $vgpr4_vgpr5 = IMPLICIT_DEF
+# GCN: $vcc = IMPLICIT_DEF
+# GCN: $vgpr2_vgpr3 = IMPLICIT_DEF
+# GCN: $vgpr6_vgpr7 = IMPLICIT_DEF
+# GCN: $vgpr8_vgpr9_vgpr10_vgpr11 = IMPLICIT_DEF
+# GCN: $vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF
+# GCN: $vgpr16_vgpr17_vgpr18_vgpr19 = IMPLICIT_DEF
+# GCN: $vgpr20_vgpr21_vgpr22_vgpr23 = IMPLICIT_DEF
+# GCN: $vgpr24_vgpr25_vgpr26_vgpr27 = IMPLICIT_DEF
+# GCN: $vgpr28_vgpr29_vgpr30_vgpr31 = IMPLICIT_DEF
+# GCN: $vgpr32_vgpr33_vgpr34_vgpr35 = IMPLICIT_DEF
+# GCN: $vgpr36_vgpr37_vgpr38_vgpr39 = IMPLICIT_DEF
+# GCN: $vgpr40_vgpr41_vgpr42_vgpr43 = IMPLICIT_DEF
+# GCN: $vgpr44_vgpr45_vgpr46_vgpr47 = IMPLICIT_DEF
+# GCN: $vgpr2 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr5, $vcc, implicit $exec
+# GCN: $vgpr2 = V_CNDMASK_B32_e64 0, $vgpr0, 0, $vgpr4, killed $vcc, implicit $exec
+# GCN: $sgpr0_sgpr1 = V_CMP_LT_U64_e64 $vgpr4_vgpr5, $vgpr0_vgpr1, implicit $exec
+---
+name: vgpr64_mixed_use
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: vreg_64, preferred-register: '$vgpr0_vgpr1' }
+ - { id: 1, class: vreg_64, preferred-register: '$vgpr4_vgpr5' }
+ - { id: 2, class: sreg_64_xexec, preferred-register: '$vcc' }
+ - { id: 3, class: vgpr_32 }
+ - { id: 4, class: vgpr_32 }
+ - { id: 5, class: sreg_64_xexec }
+ - { id: 6, class: vreg_64, preferred-register: '$vgpr2_vgpr3' }
+ - { id: 7, class: vreg_64, preferred-register: '$vgpr6_vgpr7' }
+ - { id: 8, class: vreg_128, preferred-register: '$vgpr8_vgpr9_vgpr10_vgpr11' }
+ - { id: 9, class: vreg_128, preferred-register: '$vgpr12_vgpr13_vgpr14_vgpr15' }
+ - { id: 10, class: vreg_128, preferred-register: '$vgpr16_vgpr17_vgpr18_vgpr19' }
+ - { id: 11, class: vreg_128, preferred-register: '$vgpr20_vgpr21_vgpr22_vgpr23' }
+ - { id: 12, class: vreg_128, preferred-register: '$vgpr24_vgpr25_vgpr26_vgpr27' }
+ - { id: 13, class: vreg_128, preferred-register: '$vgpr28_vgpr29_vgpr30_vgpr31' }
+ - { id: 14, class: vreg_128, preferred-register: '$vgpr32_vgpr33_vgpr34_vgpr35' }
+ - { id: 15, class: vreg_128, preferred-register: '$vgpr36_vgpr37_vgpr38_vgpr39' }
+ - { id: 16, class: vreg_128, preferred-register: '$vgpr40_vgpr41_vgpr42_vgpr43' }
+ - { id: 17, class: vreg_128, preferred-register: '$vgpr44_vgpr45_vgpr46_vgpr47' }
+body: |
+ bb.0:
+ %0 = IMPLICIT_DEF
+ %1 = IMPLICIT_DEF
+ %2 = IMPLICIT_DEF
+ %6 = IMPLICIT_DEF
+ %7 = IMPLICIT_DEF
+ %8 = IMPLICIT_DEF
+ %9 = IMPLICIT_DEF
+ %10 = IMPLICIT_DEF
+ %11 = IMPLICIT_DEF
+ %12 = IMPLICIT_DEF
+ %13 = IMPLICIT_DEF
+ %14 = IMPLICIT_DEF
+ %15 = IMPLICIT_DEF
+ %16 = IMPLICIT_DEF
+ %17 = IMPLICIT_DEF
+ %3 = V_CNDMASK_B32_e64 0, %0.sub1, 0, %1.sub1, %2, implicit $exec
+ %4 = V_CNDMASK_B32_e64 0, %0.sub0, 0, %1.sub0, %2, implicit $exec
+ %5 = V_CMP_LT_U64_e64 %1, %0, implicit $exec
+ S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/regbank-reassign.mir b/llvm/test/CodeGen/AMDGPU/regbank-reassign.mir
index 0020e17a0b6f..2078d8c22922 100644
--- a/llvm/test/CodeGen/AMDGPU/regbank-reassign.mir
+++ b/llvm/test/CodeGen/AMDGPU/regbank-reassign.mir
@@ -494,3 +494,81 @@ body: |
%2 = V_AND_B32_e32 %1, %0, implicit $exec
S_ENDPGM 0
...
+
+# Test that bank of subreg is considered during scavenging.
+# If handled incorrectly an infinite loop occurs.
+# GCN-LABEL: s0_vs_s15_16_17_sub1{{$}}
+# GCN: S_AND_B32 renamable $sgpr13, $sgpr0,
+---
+name: s0_vs_s15_16_17_sub1
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: sgpr_96, preferred-register: '$sgpr15_sgpr16_sgpr17' }
+ - { id: 1, class: sgpr_32 }
+body: |
+ bb.0:
+ %0 = IMPLICIT_DEF
+ $sgpr0 = IMPLICIT_DEF
+ %1 = S_AND_B32 %0.sub1, $sgpr0, implicit-def $scc
+ S_ENDPGM 0
+...
+
+# Test that the size of subreg is correctly handled in bank calculation.
+# If handled incorrectly an infinite loop occurs.
+# GCN-LABEL: vgpr_sub_dependence{{$}}
+# GCN: $vgpr9_vgpr10_vgpr11_vgpr12 = IMPLICIT_DEF
+# GCN: $vgpr16_vgpr17 = IMPLICIT_DEF
+# GCN: $vgpr14_vgpr15 = IMPLICIT_DEF
+# GCN: $vgpr0_vgpr1 = IMPLICIT_DEF
+# GCN: $vgpr7_vgpr8 = IMPLICIT_DEF
+# GCN: $vgpr3_vgpr4_vgpr5_vgpr6 = IMPLICIT_DEF
+# GCN: $vgpr18_vgpr19 = IMPLICIT_DEF
+# GCN: $vgpr20_vgpr21_vgpr22_vgpr23 = IMPLICIT_DEF
+# GCN: $vgpr24_vgpr25_vgpr26_vgpr27 = IMPLICIT_DEF
+# GCN: $vgpr28_vgpr29_vgpr30_vgpr31 = IMPLICIT_DEF
+# GCN: $vgpr32_vgpr33_vgpr34_vgpr35 = IMPLICIT_DEF
+# GCN: $vgpr36_vgpr37_vgpr38_vgpr39 = IMPLICIT_DEF
+# GCN: $vgpr40_vgpr41_vgpr42_vgpr43 = IMPLICIT_DEF
+# GCN: $vgpr44_vgpr45_vgpr46_vgpr47 = IMPLICIT_DEF
+# GCN: $vgpr0_vgpr1 = V_ADD_F64 0, $vgpr11_vgpr12, 0, killed $vgpr16_vgpr17, 0, 0, implicit $mode, implicit $exec
+# GCN: $vgpr0_vgpr1 = V_ADD_F64 0, $vgpr9_vgpr10, 0, killed $vgpr14_vgpr15, 0, 0, implicit $mode, implicit $exec
+---
+name: vgpr_sub_dependence
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: vreg_128, preferred-register: '$vgpr10_vgpr11_vgpr12_vgpr13' }
+ - { id: 1, class: vreg_64, preferred-register: '$vgpr16_vgpr17' }
+ - { id: 2, class: vreg_64, preferred-register: '$vgpr14_vgpr15' }
+ - { id: 3, class: vreg_64 }
+ - { id: 4, class: vreg_64 }
+ - { id: 5, class: vreg_64, preferred-register: '$vgpr0_vgpr1' }
+ - { id: 6, class: vreg_64, preferred-register: '$vgpr7_vgpr8' }
+ - { id: 7, class: vreg_128, preferred-register: '$vgpr3_vgpr4_vgpr5_vgpr6' }
+ - { id: 8, class: vreg_64, preferred-register: '$vgpr18_vgpr19' }
+ - { id: 9, class: vreg_128, preferred-register: '$vgpr20_vgpr21_vgpr22_vgpr23' }
+ - { id: 10, class: vreg_128, preferred-register: '$vgpr24_vgpr25_vgpr26_vgpr27' }
+ - { id: 11, class: vreg_128, preferred-register: '$vgpr28_vgpr29_vgpr30_vgpr31' }
+ - { id: 12, class: vreg_128, preferred-register: '$vgpr32_vgpr33_vgpr34_vgpr35' }
+ - { id: 13, class: vreg_128, preferred-register: '$vgpr36_vgpr37_vgpr38_vgpr39' }
+ - { id: 14, class: vreg_128, preferred-register: '$vgpr40_vgpr41_vgpr42_vgpr43' }
+ - { id: 15, class: vreg_128, preferred-register: '$vgpr44_vgpr45_vgpr46_vgpr47' }
+body: |
+ bb.0:
+ %0 = IMPLICIT_DEF
+ %1 = IMPLICIT_DEF
+ %2 = IMPLICIT_DEF
+ %5 = IMPLICIT_DEF
+ %6 = IMPLICIT_DEF
+ %7 = IMPLICIT_DEF
+ %8 = IMPLICIT_DEF
+ %9 = IMPLICIT_DEF
+ %10 = IMPLICIT_DEF
+ %11 = IMPLICIT_DEF
+ %12 = IMPLICIT_DEF
+ %13 = IMPLICIT_DEF
+ %14 = IMPLICIT_DEF
+ %15 = IMPLICIT_DEF
+ %3 = V_ADD_F64 0, %0.sub2_sub3:vreg_128, 0, %1:vreg_64, 0, 0, implicit $mode, implicit $exec
+ %4 = V_ADD_F64 0, %0.sub0_sub1:vreg_128, 0, %2:vreg_64, 0, 0, implicit $mode, implicit $exec
+ S_ENDPGM 0
+...
More information about the llvm-commits
mailing list