[llvm] 58adab3 - [AMDGPU] Resolve pseudo registers at encoding uses
Joe Nash via llvm-commits
llvm-commits at lists.llvm.org
Wed Nov 4 09:56:09 PST 2020
Author: Joe Nash
Date: 2020-11-04T12:52:32-05:00
New Revision: 58adab34c4801e835bf1a55efbd1213b201660fb
URL: https://github.com/llvm/llvm-project/commit/58adab34c4801e835bf1a55efbd1213b201660fb
DIFF: https://github.com/llvm/llvm-project/commit/58adab34c4801e835bf1a55efbd1213b201660fb.diff
LOG: [AMDGPU] Resolve pseudo registers at encoding uses
Pseudo-registers allow different register encodings
between GPU generations. Make sure we resolve the
pseudo regs to real regs whenever we get their
hardware encoding.
Using the correct encodings revealed a register
bank conflict and an unnecessary write dependency.
Tests have been updated to match.
Reviewed By: rampitec
Differential Revision: https://reviews.llvm.org/D90721
Change-Id: I73c154cd24aecc820993b50bebaf4df97a5710ca
Added:
Modified:
llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
llvm/lib/Target/AMDGPU/SIRegisterInfo.h
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll
llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
llvm/test/CodeGen/AMDGPU/flat-scratch.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 63ae49d5f67b..b4af9de44cc4 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -2994,7 +2994,8 @@ bool AMDGPUAsmParser::usesConstantBus(const MCInst &Inst, unsigned OpIdx) {
} else if (MO.isReg()) {
auto Reg = MO.getReg();
const MCRegisterInfo *TRI = getContext().getRegisterInfo();
- return isSGPR(mc2PseudoReg(Reg), TRI) && Reg != SGPR_NULL;
+ auto PReg = mc2PseudoReg(Reg);
+ return isSGPR(PReg, TRI) && PReg != SGPR_NULL;
} else {
return true;
}
diff --git a/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp b/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp
index 9a27b23ce419..26cd308a2ef7 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp
@@ -35,6 +35,7 @@
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/LiveInterval.h"
@@ -321,7 +322,7 @@ unsigned GCNRegBankReassign::getPhysRegBank(Register Reg,
return RegNo % NUM_VGPR_BANKS;
}
- unsigned RegNo = TRI->getEncodingValue(Reg) / 2;
+ unsigned RegNo = TRI->getEncodingValue(AMDGPU::getMCReg(Reg, *ST)) / 2;
return RegNo % NUM_SGPR_BANKS + SGPR_BANK_OFFSET;
}
@@ -366,7 +367,7 @@ uint32_t GCNRegBankReassign::getRegBankMask(Register Reg, unsigned SubReg,
}
// SGPRs have 8 banks holding 2 consequitive registers each.
- unsigned RegNo = TRI->getEncodingValue(Reg) / 2;
+ unsigned RegNo = TRI->getEncodingValue(AMDGPU::getMCReg(Reg, *ST)) / 2;
unsigned StartBit = AMDGPU::VGPR_32RegClass.getNumRegs();
if (RegNo + StartBit >= RegsUsed.size())
return 0;
@@ -818,9 +819,10 @@ bool GCNRegBankReassign::runOnMachineFunction(MachineFunction &MF) {
MaxNumSGPRs = std::min(ST->getMaxNumSGPRs(Occupancy, true), MaxNumSGPRs);
CSRegs = MRI->getCalleeSavedRegs();
-
- RegsUsed.resize(AMDGPU::VGPR_32RegClass.getNumRegs() +
- TRI->getEncodingValue(AMDGPU::SGPR_NULL) / 2 + 1);
+ unsigned NumRegBanks = AMDGPU::VGPR_32RegClass.getNumRegs() +
+ // Not a tight bound
+ AMDGPU::SReg_32RegClass.getNumRegs() / 2 + 1;
+ RegsUsed.resize(NumRegBanks);
LLVM_DEBUG(dbgs() << "=== RegBanks reassign analysis on function " << MF.getName()
<< '\n');
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index b614cb9d3052..69c1969b8957 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -487,7 +487,7 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
RegInterval Result;
- unsigned Reg = TRI->getEncodingValue(Op.getReg());
+ unsigned Reg = TRI->getEncodingValue(AMDGPU::getMCReg(Op.getReg(), *ST));
if (TRI->isVGPR(*MRI, Op.getReg())) {
assert(Reg >= RegisterEncoding.VGPR0 && Reg <= RegisterEncoding.VGPRL);
@@ -624,8 +624,9 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
MachineOperand &DefMO = Inst.getOperand(I);
if (DefMO.isReg() && DefMO.isDef() &&
TRI->isVGPR(*MRI, DefMO.getReg())) {
- setRegScore(TRI->getEncodingValue(DefMO.getReg()), EXP_CNT,
- CurrScore);
+ setRegScore(
+ TRI->getEncodingValue(AMDGPU::getMCReg(DefMO.getReg(), *ST)),
+ EXP_CNT, CurrScore);
}
}
}
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index a750a055f4be..074c57346492 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -132,6 +132,7 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
StringRef getRegAsmName(MCRegister Reg) const override;
+ // Pseudo regs are not allowed
unsigned getHWRegIndex(MCRegister Reg) const {
return getEncodingValue(Reg) & 0xff;
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll
index 775c1afbf0f6..826e833abc38 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll
@@ -29,7 +29,7 @@ define amdgpu_ps <4 x float> @gather4_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
;
; GFX10NSA-LABEL: gather4_2d:
; GFX10NSA: ; %bb.0: ; %main_body
-; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo
+; GFX10NSA-NEXT: s_mov_b32 s28, exec_lo
; GFX10NSA-NEXT: s_mov_b32 s0, s2
; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10NSA-NEXT: v_lshlrev_b32_e32 v1, 16, v1
@@ -45,7 +45,7 @@ define amdgpu_ps <4 x float> @gather4_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX10NSA-NEXT: s_mov_b32 s10, s12
; GFX10NSA-NEXT: s_mov_b32 s11, s13
; GFX10NSA-NEXT: v_and_or_b32 v0, v0, 0xffff, v1
-; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14
+; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s28
; GFX10NSA-NEXT: image_gather4 v[0:3], v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX10NSA-NEXT: ; implicit-def: $vcc_hi
; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
@@ -84,7 +84,7 @@ define amdgpu_ps <4 x float> @gather4_cube(<8 x i32> inreg %rsrc, <4 x i32> inre
;
; GFX10NSA-LABEL: gather4_cube:
; GFX10NSA: ; %bb.0: ; %main_body
-; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo
+; GFX10NSA-NEXT: s_mov_b32 s28, exec_lo
; GFX10NSA-NEXT: s_mov_b32 s0, s2
; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10NSA-NEXT: v_mov_b32_e32 v3, 0xffff
@@ -103,7 +103,7 @@ define amdgpu_ps <4 x float> @gather4_cube(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX10NSA-NEXT: v_and_or_b32 v0, v0, v3, v1
; GFX10NSA-NEXT: s_mov_b32 s11, s13
; GFX10NSA-NEXT: v_and_or_b32 v1, v2, v3, s12
-; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14
+; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s28
; GFX10NSA-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_CUBE a16
; GFX10NSA-NEXT: ; implicit-def: $vcc_hi
; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
@@ -142,7 +142,7 @@ define amdgpu_ps <4 x float> @gather4_2darray(<8 x i32> inreg %rsrc, <4 x i32> i
;
; GFX10NSA-LABEL: gather4_2darray:
; GFX10NSA: ; %bb.0: ; %main_body
-; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo
+; GFX10NSA-NEXT: s_mov_b32 s28, exec_lo
; GFX10NSA-NEXT: s_mov_b32 s0, s2
; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10NSA-NEXT: v_mov_b32_e32 v3, 0xffff
@@ -161,7 +161,7 @@ define amdgpu_ps <4 x float> @gather4_2darray(<8 x i32> inreg %rsrc, <4 x i32> i
; GFX10NSA-NEXT: v_and_or_b32 v0, v0, v3, v1
; GFX10NSA-NEXT: s_mov_b32 s11, s13
; GFX10NSA-NEXT: v_and_or_b32 v1, v2, v3, s12
-; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14
+; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s28
; GFX10NSA-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY a16
; GFX10NSA-NEXT: ; implicit-def: $vcc_hi
; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
@@ -198,7 +198,7 @@ define amdgpu_ps <4 x float> @gather4_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
;
; GFX10NSA-LABEL: gather4_c_2d:
; GFX10NSA: ; %bb.0: ; %main_body
-; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo
+; GFX10NSA-NEXT: s_mov_b32 s28, exec_lo
; GFX10NSA-NEXT: s_mov_b32 s0, s2
; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10NSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2
@@ -214,7 +214,7 @@ define amdgpu_ps <4 x float> @gather4_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX10NSA-NEXT: s_mov_b32 s10, s12
; GFX10NSA-NEXT: s_mov_b32 s11, s13
; GFX10NSA-NEXT: v_and_or_b32 v1, v1, 0xffff, v2
-; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14
+; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s28
; GFX10NSA-NEXT: image_gather4_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX10NSA-NEXT: ; implicit-def: $vcc_hi
; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
@@ -253,7 +253,7 @@ define amdgpu_ps <4 x float> @gather4_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
;
; GFX10NSA-LABEL: gather4_cl_2d:
; GFX10NSA: ; %bb.0: ; %main_body
-; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo
+; GFX10NSA-NEXT: s_mov_b32 s28, exec_lo
; GFX10NSA-NEXT: s_mov_b32 s0, s2
; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10NSA-NEXT: v_mov_b32_e32 v3, 0xffff
@@ -272,7 +272,7 @@ define amdgpu_ps <4 x float> @gather4_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX10NSA-NEXT: v_and_or_b32 v0, v0, v3, v1
; GFX10NSA-NEXT: s_mov_b32 s11, s13
; GFX10NSA-NEXT: v_and_or_b32 v1, v2, v3, s12
-; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14
+; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s28
; GFX10NSA-NEXT: image_gather4_cl v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX10NSA-NEXT: ; implicit-def: $vcc_hi
; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
@@ -311,7 +311,7 @@ define amdgpu_ps <4 x float> @gather4_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
;
; GFX10NSA-LABEL: gather4_c_cl_2d:
; GFX10NSA: ; %bb.0: ; %main_body
-; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo
+; GFX10NSA-NEXT: s_mov_b32 s28, exec_lo
; GFX10NSA-NEXT: s_mov_b32 s0, s2
; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10NSA-NEXT: v_mov_b32_e32 v4, 0xffff
@@ -330,7 +330,7 @@ define amdgpu_ps <4 x float> @gather4_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
; GFX10NSA-NEXT: v_and_or_b32 v1, v1, v4, v2
; GFX10NSA-NEXT: s_mov_b32 s11, s13
; GFX10NSA-NEXT: v_and_or_b32 v2, v3, v4, s12
-; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14
+; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s28
; GFX10NSA-NEXT: image_gather4_c_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX10NSA-NEXT: ; implicit-def: $vcc_hi
; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
@@ -367,7 +367,7 @@ define amdgpu_ps <4 x float> @gather4_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
;
; GFX10NSA-LABEL: gather4_b_2d:
; GFX10NSA: ; %bb.0: ; %main_body
-; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo
+; GFX10NSA-NEXT: s_mov_b32 s28, exec_lo
; GFX10NSA-NEXT: s_mov_b32 s0, s2
; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10NSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2
@@ -383,7 +383,7 @@ define amdgpu_ps <4 x float> @gather4_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX10NSA-NEXT: s_mov_b32 s10, s12
; GFX10NSA-NEXT: s_mov_b32 s11, s13
; GFX10NSA-NEXT: v_and_or_b32 v1, v1, 0xffff, v2
-; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14
+; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s28
; GFX10NSA-NEXT: image_gather4_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX10NSA-NEXT: ; implicit-def: $vcc_hi
; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
@@ -420,7 +420,7 @@ define amdgpu_ps <4 x float> @gather4_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> in
;
; GFX10NSA-LABEL: gather4_c_b_2d:
; GFX10NSA: ; %bb.0: ; %main_body
-; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo
+; GFX10NSA-NEXT: s_mov_b32 s28, exec_lo
; GFX10NSA-NEXT: s_mov_b32 s0, s2
; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10NSA-NEXT: v_lshlrev_b32_e32 v3, 16, v3
@@ -436,7 +436,7 @@ define amdgpu_ps <4 x float> @gather4_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX10NSA-NEXT: s_mov_b32 s10, s12
; GFX10NSA-NEXT: s_mov_b32 s11, s13
; GFX10NSA-NEXT: v_and_or_b32 v2, v2, 0xffff, v3
-; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14
+; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s28
; GFX10NSA-NEXT: image_gather4_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX10NSA-NEXT: ; implicit-def: $vcc_hi
; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
@@ -475,7 +475,7 @@ define amdgpu_ps <4 x float> @gather4_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
;
; GFX10NSA-LABEL: gather4_b_cl_2d:
; GFX10NSA: ; %bb.0: ; %main_body
-; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo
+; GFX10NSA-NEXT: s_mov_b32 s28, exec_lo
; GFX10NSA-NEXT: s_mov_b32 s0, s2
; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10NSA-NEXT: v_mov_b32_e32 v4, 0xffff
@@ -494,7 +494,7 @@ define amdgpu_ps <4 x float> @gather4_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
; GFX10NSA-NEXT: v_and_or_b32 v1, v1, v4, v2
; GFX10NSA-NEXT: s_mov_b32 s11, s13
; GFX10NSA-NEXT: v_and_or_b32 v2, v3, v4, s12
-; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14
+; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s28
; GFX10NSA-NEXT: image_gather4_b_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX10NSA-NEXT: ; implicit-def: $vcc_hi
; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
@@ -533,7 +533,7 @@ define amdgpu_ps <4 x float> @gather4_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
;
; GFX10NSA-LABEL: gather4_c_b_cl_2d:
; GFX10NSA: ; %bb.0: ; %main_body
-; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo
+; GFX10NSA-NEXT: s_mov_b32 s28, exec_lo
; GFX10NSA-NEXT: s_mov_b32 s0, s2
; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10NSA-NEXT: v_mov_b32_e32 v5, 0xffff
@@ -552,7 +552,7 @@ define amdgpu_ps <4 x float> @gather4_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX10NSA-NEXT: v_and_or_b32 v2, v2, v5, v3
; GFX10NSA-NEXT: s_mov_b32 s11, s13
; GFX10NSA-NEXT: v_and_or_b32 v3, v4, v5, s12
-; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14
+; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s28
; GFX10NSA-NEXT: image_gather4_c_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX10NSA-NEXT: ; implicit-def: $vcc_hi
; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
index e13f29fc1a5a..0d816dca1e16 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
@@ -123,7 +123,7 @@ define amdgpu_ps void @add_i32_constant(<4 x i32> inreg %out, <4 x i32> inreg %i
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s12, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s13, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT: s_and_saveexec_b64 s[30:31], vcc
+; GFX1064-NEXT: s_and_saveexec_b64 s[28:29], vcc
; GFX1064-NEXT: s_cbranch_execz BB0_3
; GFX1064-NEXT: ; %bb.2:
; GFX1064-NEXT: s_bcnt1_i32_b64 s12, s[12:13]
@@ -131,7 +131,7 @@ define amdgpu_ps void @add_i32_constant(<4 x i32> inreg %out, <4 x i32> inreg %i
; GFX1064-NEXT: buffer_atomic_add v1, off, s[4:7], 0 glc
; GFX1064-NEXT: BB0_3:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT: s_or_b64 exec, exec, s[30:31]
+; GFX1064-NEXT: s_or_b64 exec, exec, s[28:29]
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_readfirstlane_b32 s4, v1
; GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s4
@@ -355,14 +355,14 @@ define amdgpu_ps void @add_i32_varying(<4 x i32> inreg %out, <4 x i32> inreg %in
; GFX1064-NEXT: s_mov_b64 exec, s[10:11]
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: ; implicit-def: $vgpr0
-; GFX1064-NEXT: s_and_saveexec_b64 s[30:31], vcc
+; GFX1064-NEXT: s_and_saveexec_b64 s[28:29], vcc
; GFX1064-NEXT: s_cbranch_execz BB1_3
; GFX1064-NEXT: ; %bb.2:
; GFX1064-NEXT: v_mov_b32_e32 v0, s12
; GFX1064-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc
; GFX1064-NEXT: BB1_3:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT: s_or_b64 exec, exec, s[30:31]
+; GFX1064-NEXT: s_or_b64 exec, exec, s[28:29]
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_readfirstlane_b32 s4, v0
; GFX1064-NEXT: v_mov_b32_e32 v0, v3
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
index d88c94c60838..1e2732e39136 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
@@ -131,10 +131,10 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
; GFX9-LABEL: store_load_sindex_kernel:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-NEXT: v_mov_b32_e32 v0, 15
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b32 s1, s0, 2
; GFX9-NEXT: s_and_b32 s0, s0, 15
; GFX9-NEXT: s_lshl_b32 s0, s0, 2
@@ -469,11 +469,11 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
; GFX9-LABEL: store_load_sindex_small_offset_kernel:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b32 s1, s0, 2
; GFX9-NEXT: s_and_b32 s0, s0, 15
; GFX9-NEXT: s_lshl_b32 s0, s0, 2
@@ -492,10 +492,10 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: scratch_load_dword v0, off, off offset:4
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, 15
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_and_b32 s1, s0, 15
; GFX10-NEXT: s_lshl_b32 s0, s0, 2
; GFX10-NEXT: s_lshl_b32 s1, s1, 2
@@ -863,11 +863,11 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
; GFX9-LABEL: store_load_sindex_large_offset_kernel:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b32 s1, s0, 2
; GFX9-NEXT: s_and_b32 s0, s0, 15
; GFX9-NEXT: s_lshl_b32 s0, s0, 2
@@ -886,10 +886,10 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: scratch_load_dword v0, off, off offset:4
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, 15
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_and_b32 s1, s0, 15
; GFX10-NEXT: s_lshl_b32 s0, s0, 2
; GFX10-NEXT: s_lshl_b32 s1, s1, 2
@@ -1149,10 +1149,10 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
; GFX9-LABEL: store_load_vidx_sidx_offset:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5
; GFX9-NEXT: v_mov_b32_e32 v1, 4
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_u32_e32 v0, s0, v0
; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1
; GFX9-NEXT: v_mov_b32_e32 v1, 15
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll
index 2d2ac532836d..fa744b880974 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll
@@ -97,11 +97,11 @@ define amdgpu_ps half @image_sample_2d_f16_tfe(<8 x i32> inreg %rsrc, <4 x i32>
;
; GFX10-LABEL: image_sample_2d_f16_tfe:
; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: s_mov_b32 s14, exec_lo
+; GFX10-NEXT: s_mov_b32 s28, exec_lo
; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s14
+; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s28
; GFX10-NEXT: image_sample v[2:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe d16
; GFX10-NEXT: v_mov_b32_e32 v0, s12
; GFX10-NEXT: v_mov_b32_e32 v1, s13
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
index 3ce35c7ebf99..02f57ff00e15 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
@@ -77,7 +77,7 @@ define amdgpu_ps <4 x float> @sample_1d_tfe(<8 x i32> inreg %rsrc, <4 x i32> inr
;
; GFX10-LABEL: sample_1d_tfe:
; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: s_mov_b32 s14, exec_lo ; encoding: [0x7e,0x03,0x8e,0xbe]
+; GFX10-NEXT: s_mov_b32 s28, exec_lo ; encoding: [0x7e,0x03,0x9c,0xbe]
; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe]
; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
@@ -85,7 +85,7 @@ define amdgpu_ps <4 x float> @sample_1d_tfe(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e]
-; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; encoding: [0x7e,0x0e,0x7e,0x87]
+; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s28 ; encoding: [0x7e,0x1c,0x7e,0x87]
; GFX10-NEXT: image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D tfe ; encoding: [0x00,0x0f,0x81,0xf0,0x05,0x00,0x40,0x00]
; GFX10-NEXT: v_mov_b32_e32 v5, s12 ; encoding: [0x0c,0x02,0x0a,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v6, s13 ; encoding: [0x0d,0x02,0x0c,0x7e]
@@ -499,7 +499,7 @@ define amdgpu_ps <4 x float> @sample_1d_lwe(<8 x i32> inreg %rsrc, <4 x i32> inr
;
; GFX10-LABEL: sample_1d_lwe:
; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: s_mov_b32 s14, exec_lo ; encoding: [0x7e,0x03,0x8e,0xbe]
+; GFX10-NEXT: s_mov_b32 s28, exec_lo ; encoding: [0x7e,0x03,0x9c,0xbe]
; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe]
; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
@@ -507,7 +507,7 @@ define amdgpu_ps <4 x float> @sample_1d_lwe(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e]
-; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; encoding: [0x7e,0x0e,0x7e,0x87]
+; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s28 ; encoding: [0x7e,0x1c,0x7e,0x87]
; GFX10-NEXT: image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D lwe ; encoding: [0x00,0x0f,0x82,0xf0,0x05,0x00,0x40,0x00]
; GFX10-NEXT: v_mov_b32_e32 v5, s12 ; encoding: [0x0c,0x02,0x0a,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v6, s13 ; encoding: [0x0d,0x02,0x0c,0x7e]
More information about the llvm-commits
mailing list