[llvm] 9a9a092 - [AMDGPU] Avoid sorting stalls in regbank-reassign
Stanislav Mekhanoshin via llvm-commits
llvm-commits at lists.llvm.org
Fri Aug 21 11:49:55 PDT 2020
Author: Stanislav Mekhanoshin
Date: 2020-08-21T11:49:41-07:00
New Revision: 9a9a092e61d4199207e515be087648b8cc4f9053
URL: https://github.com/llvm/llvm-project/commit/9a9a092e61d4199207e515be087648b8cc4f9053
DIFF: https://github.com/llvm/llvm-project/commit/9a9a092e61d4199207e515be087648b8cc4f9053.diff
LOG: [AMDGPU] Avoid sorting stalls in regbank-reassign
Sorting the candidate list is the slowest operation in this already slow pass.
Instead of sorting, just put the stall candidates into an ordered
map keyed by weight.
Differential Revision: https://reviews.llvm.org/D86253
Added:
Modified:
llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll
llvm/test/CodeGen/AMDGPU/regbank-reassign.mir
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp b/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp
index d66e26ce01c5..1c940428273c 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp
@@ -84,18 +84,15 @@ class GCNRegBankReassign : public MachineFunctionPass {
class Candidate {
public:
Candidate(MachineInstr *mi, Register reg, unsigned subreg,
- unsigned freebanks, unsigned weight)
- : MI(mi), Reg(reg), SubReg(subreg), FreeBanks(freebanks),
- Weight(weight) {}
-
- bool operator< (const Candidate& RHS) const { return Weight < RHS.Weight; }
+ unsigned freebanks)
+ : MI(mi), Reg(reg), SubReg(subreg), FreeBanks(freebanks) {}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void dump(const GCNRegBankReassign *P) const {
MI->dump();
dbgs() << P->printReg(Reg) << " to banks ";
dumpFreeBanks(FreeBanks);
- dbgs() << " weight " << Weight << '\n';
+ dbgs() << '\n';
}
#endif
@@ -103,16 +100,35 @@ class GCNRegBankReassign : public MachineFunctionPass {
Register Reg;
unsigned SubReg;
unsigned FreeBanks;
- unsigned Weight;
};
- class CandidateList : public std::list<Candidate> {
+ class CandidateList : public std::map<unsigned, std::list<Candidate>> {
public:
- // Speedup subsequent sort.
- void push(const Candidate&& C) {
- if (C.Weight) push_back(C);
- else push_front(C);
+ void push(unsigned Weight, const Candidate&& C) {
+ operator[](Weight).push_front(C);
+ }
+
+ Candidate &back() {
+ return rbegin()->second.back();
+ }
+
+ void pop_back() {
+ rbegin()->second.pop_back();
+ if (rbegin()->second.empty())
+ erase(rbegin()->first);
+ }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ void dump(const GCNRegBankReassign *P) const {
+ dbgs() << "\nCandidates:\n\n";
+ for (auto &B : *this) {
+ dbgs() << " Weight " << B.first << ":\n";
+ for (auto &C : B.second)
+ C.dump(P);
+ }
+ dbgs() << "\n\n";
}
+#endif
};
public:
@@ -601,11 +617,11 @@ void GCNRegBankReassign::collectCandidates(MachineInstr& MI,
unsigned FreeBanks1 = getFreeBanks(Reg1, SubReg1, Mask1, UsedBanks);
unsigned FreeBanks2 = getFreeBanks(Reg2, SubReg2, Mask2, UsedBanks);
if (FreeBanks1)
- Candidates.push(Candidate(&MI, Reg1, SubReg1, FreeBanks1,
- Weight + ((Size2 > Size1) ? 1 : 0)));
+ Candidates.push(Weight + ((Size2 > Size1) ? 1 : 0),
+ Candidate(&MI, Reg1, SubReg1, FreeBanks1));
if (FreeBanks2)
- Candidates.push(Candidate(&MI, Reg2, SubReg2, FreeBanks2,
- Weight + ((Size1 > Size2) ? 1 : 0)));
+ Candidates.push(Weight + ((Size1 > Size2) ? 1 : 0),
+ Candidate(&MI, Reg2, SubReg2, FreeBanks2));
}
}
}
@@ -761,9 +777,15 @@ unsigned GCNRegBankReassign::collectCandidates(MachineFunction &MF,
}
void GCNRegBankReassign::removeCandidates(Register Reg) {
- Candidates.remove_if([Reg, this](const Candidate& C) {
- return C.MI->readsRegister(Reg, TRI);
- });
+ typename CandidateList::iterator Next;
+ for (auto I = Candidates.begin(), E = Candidates.end(); I != E; I = Next) {
+ Next = std::next(I);
+ I->second.remove_if([Reg, this](const Candidate& C) {
+ return C.MI->readsRegister(Reg, TRI);
+ });
+ if (I->second.empty())
+ Candidates.erase(I);
+ }
}
bool GCNRegBankReassign::verifyCycles(MachineFunction &MF,
@@ -808,11 +830,7 @@ bool GCNRegBankReassign::runOnMachineFunction(MachineFunction &MF) {
LLVM_DEBUG(dbgs() << "=== " << StallCycles << " stall cycles detected in "
"function " << MF.getName() << '\n');
- Candidates.sort();
-
- LLVM_DEBUG(dbgs() << "\nCandidates:\n\n";
- for (auto C : Candidates) C.dump(this);
- dbgs() << "\n\n");
+ LLVM_DEBUG(Candidates.dump(this));
unsigned CyclesSaved = 0;
while (!Candidates.empty()) {
@@ -827,12 +845,8 @@ bool GCNRegBankReassign::runOnMachineFunction(MachineFunction &MF) {
if (LocalCyclesSaved) {
removeCandidates(C.Reg);
computeStallCycles(C.Reg, AMDGPU::NoRegister, 0, -1, true);
- Candidates.sort();
- LLVM_DEBUG(dbgs() << "\nCandidates:\n\n";
- for (auto C : Candidates)
- C.dump(this);
- dbgs() << "\n\n");
+ LLVM_DEBUG(Candidates.dump(this));
}
}
NumStallsRecovered += CyclesSaved;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll
index caba19977a49..d1c49af8b598 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll
@@ -21,12 +21,12 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX10-LABEL: sample_d_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_and_or_b32 v3, v2, v6, v3
-; GFX10-NEXT: v_and_or_b32 v10, v0, v6, v1
-; GFX10-NEXT: image_sample_d_g16 v[0:3], [v10, v3, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX10-NEXT: v_and_or_b32 v7, v0, v6, v1
+; GFX10-NEXT: v_and_or_b32 v2, v2, v6, v3
+; GFX10-NEXT: image_sample_d_g16 v[0:3], [v7, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
@@ -77,9 +77,9 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_and_or_b32 v1, v1, v7, v2
-; GFX10-NEXT: v_and_or_b32 v3, v3, v7, v4
-; GFX10-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX10-NEXT: v_and_or_b32 v11, v1, v7, v2
+; GFX10-NEXT: v_and_or_b32 v2, v3, v7, v4
+; GFX10-NEXT: image_sample_c_d_g16 v[0:3], [v0, v11, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
@@ -107,12 +107,12 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX10-LABEL: sample_d_cl_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff
-; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v3
; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_and_or_b32 v3, v2, v7, v9
-; GFX10-NEXT: v_and_or_b32 v0, v0, v7, v1
-; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v3, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX10-NEXT: v_and_or_b32 v11, v0, v7, v1
+; GFX10-NEXT: v_and_or_b32 v1, v2, v7, v9
+; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v11, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
@@ -173,12 +173,12 @@ define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX10-LABEL: sample_cd_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_and_or_b32 v3, v2, v6, v3
-; GFX10-NEXT: v_and_or_b32 v10, v0, v6, v1
-; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v10, v3, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX10-NEXT: v_and_or_b32 v7, v0, v6, v1
+; GFX10-NEXT: v_and_or_b32 v2, v2, v6, v3
+; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v7, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
@@ -209,9 +209,9 @@ define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_and_or_b32 v1, v1, v7, v2
-; GFX10-NEXT: v_and_or_b32 v3, v3, v7, v4
-; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX10-NEXT: v_and_or_b32 v11, v1, v7, v2
+; GFX10-NEXT: v_and_or_b32 v2, v3, v7, v4
+; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v11, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
@@ -239,12 +239,12 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
; GFX10-LABEL: sample_cd_cl_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff
-; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v3
; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_and_or_b32 v3, v2, v7, v9
-; GFX10-NEXT: v_and_or_b32 v0, v0, v7, v1
-; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v3, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX10-NEXT: v_and_or_b32 v11, v0, v7, v1
+; GFX10-NEXT: v_and_or_b32 v1, v2, v7, v9
+; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v11, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
index b62592eb1c47..6814f5bb1843 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
@@ -2983,19 +2983,19 @@ define <2 x i64> @v_uaddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v10, v4
; GFX10-NEXT: v_mov_b32_e32 v11, v5
-; GFX10-NEXT: v_mov_b32_e32 v8, v6
-; GFX10-NEXT: v_mov_b32_e32 v9, v7
+; GFX10-NEXT: v_mov_b32_e32 v15, v6
+; GFX10-NEXT: v_mov_b32_e32 v16, v7
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, v10
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v11, vcc_lo
-; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v2, v8
-; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v9, vcc_lo
+; GFX10-NEXT: v_add_co_u32_e64 v5, vcc_lo, v2, v15
+; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v3, v16, vcc_lo
; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[10:11]
-; GFX10-NEXT: v_cmp_lt_u64_e64 s4, v[2:3], v[8:9]
+; GFX10-NEXT: v_cmp_lt_u64_e64 s4, v[5:6], v[15:16]
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, -1, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, -1, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, -1, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, -1, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
ret <2 x i64> %result
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index a13320bea7a1..c47412b3fe97 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -511,11 +511,11 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz BB2_2
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
-; GFX1064-NEXT: v_mov_b32_e32 v7, s3
+; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
+; GFX1064-NEXT: v_mov_b32_e32 v4, s3
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX1064-NEXT: ds_add_rtn_u32 v0, v0, v7
+; GFX1064-NEXT: ds_add_rtn_u32 v0, v7, v4
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_gl1_inv
@@ -563,11 +563,11 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB2_2
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
-; GFX1032-NEXT: v_mov_b32_e32 v7, s3
+; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
+; GFX1032-NEXT: v_mov_b32_e32 v4, s3
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX1032-NEXT: ds_add_rtn_u32 v0, v0, v7
+; GFX1032-NEXT: ds_add_rtn_u32 v0, v7, v4
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
@@ -750,11 +750,11 @@ define amdgpu_kernel void @add_i32_varying_gfx1032(i32 addrspace(1)* %out) {
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz BB3_2
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
-; GFX1064-NEXT: v_mov_b32_e32 v7, s3
+; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
+; GFX1064-NEXT: v_mov_b32_e32 v4, s3
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX1064-NEXT: ds_add_rtn_u32 v0, v0, v7
+; GFX1064-NEXT: ds_add_rtn_u32 v0, v7, v4
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_gl1_inv
@@ -802,11 +802,11 @@ define amdgpu_kernel void @add_i32_varying_gfx1032(i32 addrspace(1)* %out) {
; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB3_2
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
-; GFX1032-NEXT: v_mov_b32_e32 v7, s3
+; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
+; GFX1032-NEXT: v_mov_b32_e32 v4, s3
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX1032-NEXT: ds_add_rtn_u32 v0, v0, v7
+; GFX1032-NEXT: ds_add_rtn_u32 v0, v7, v4
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
@@ -989,11 +989,11 @@ define amdgpu_kernel void @add_i32_varying_gfx1064(i32 addrspace(1)* %out) {
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz BB4_2
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
-; GFX1064-NEXT: v_mov_b32_e32 v7, s3
+; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
+; GFX1064-NEXT: v_mov_b32_e32 v4, s3
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX1064-NEXT: ds_add_rtn_u32 v0, v0, v7
+; GFX1064-NEXT: ds_add_rtn_u32 v0, v7, v4
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_gl1_inv
@@ -1041,11 +1041,11 @@ define amdgpu_kernel void @add_i32_varying_gfx1064(i32 addrspace(1)* %out) {
; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB4_2
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
-; GFX1032-NEXT: v_mov_b32_e32 v7, s3
+; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
+; GFX1032-NEXT: v_mov_b32_e32 v4, s3
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX1032-NEXT: ds_add_rtn_u32 v0, v0, v7
+; GFX1032-NEXT: ds_add_rtn_u32 v0, v7, v4
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
@@ -2064,11 +2064,11 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz BB10_2
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
-; GFX1064-NEXT: v_mov_b32_e32 v7, s3
+; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
+; GFX1064-NEXT: v_mov_b32_e32 v4, s3
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX1064-NEXT: ds_sub_rtn_u32 v0, v0, v7
+; GFX1064-NEXT: ds_sub_rtn_u32 v0, v7, v4
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_gl1_inv
@@ -2116,11 +2116,11 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB10_2
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
-; GFX1032-NEXT: v_mov_b32_e32 v7, s3
+; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
+; GFX1032-NEXT: v_mov_b32_e32 v4, s3
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX1032-NEXT: ds_sub_rtn_u32 v0, v0, v7
+; GFX1032-NEXT: ds_sub_rtn_u32 v0, v7, v4
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
@@ -2799,11 +2799,11 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz BB14_2
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
-; GFX1064-NEXT: v_mov_b32_e32 v7, s3
+; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
+; GFX1064-NEXT: v_mov_b32_e32 v4, s3
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX1064-NEXT: ds_and_rtn_b32 v0, v0, v7
+; GFX1064-NEXT: ds_and_rtn_b32 v0, v7, v4
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_gl1_inv
@@ -2850,11 +2850,11 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB14_2
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
-; GFX1032-NEXT: v_mov_b32_e32 v7, s3
+; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
+; GFX1032-NEXT: v_mov_b32_e32 v4, s3
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX1032-NEXT: ds_and_rtn_b32 v0, v0, v7
+; GFX1032-NEXT: ds_and_rtn_b32 v0, v7, v4
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
@@ -3037,11 +3037,11 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz BB15_2
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
-; GFX1064-NEXT: v_mov_b32_e32 v7, s3
+; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
+; GFX1064-NEXT: v_mov_b32_e32 v4, s3
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX1064-NEXT: ds_or_rtn_b32 v0, v0, v7
+; GFX1064-NEXT: ds_or_rtn_b32 v0, v7, v4
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_gl1_inv
@@ -3089,11 +3089,11 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB15_2
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
-; GFX1032-NEXT: v_mov_b32_e32 v7, s3
+; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
+; GFX1032-NEXT: v_mov_b32_e32 v4, s3
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX1032-NEXT: ds_or_rtn_b32 v0, v0, v7
+; GFX1032-NEXT: ds_or_rtn_b32 v0, v7, v4
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
@@ -3276,11 +3276,11 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz BB16_2
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
-; GFX1064-NEXT: v_mov_b32_e32 v7, s3
+; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
+; GFX1064-NEXT: v_mov_b32_e32 v4, s3
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX1064-NEXT: ds_xor_rtn_b32 v0, v0, v7
+; GFX1064-NEXT: ds_xor_rtn_b32 v0, v7, v4
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_gl1_inv
@@ -3328,11 +3328,11 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB16_2
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
-; GFX1032-NEXT: v_mov_b32_e32 v7, s3
+; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
+; GFX1032-NEXT: v_mov_b32_e32 v4, s3
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX1032-NEXT: ds_xor_rtn_b32 v0, v0, v7
+; GFX1032-NEXT: ds_xor_rtn_b32 v0, v7, v4
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
@@ -3512,11 +3512,11 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz BB17_2
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
-; GFX1064-NEXT: v_mov_b32_e32 v7, s3
+; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
+; GFX1064-NEXT: v_mov_b32_e32 v4, s3
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX1064-NEXT: ds_max_rtn_i32 v0, v0, v7
+; GFX1064-NEXT: ds_max_rtn_i32 v0, v7, v4
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_gl1_inv
@@ -3563,11 +3563,11 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB17_2
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
-; GFX1032-NEXT: v_mov_b32_e32 v7, s3
+; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
+; GFX1032-NEXT: v_mov_b32_e32 v4, s3
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX1032-NEXT: ds_max_rtn_i32 v0, v0, v7
+; GFX1032-NEXT: ds_max_rtn_i32 v0, v7, v4
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
@@ -3932,11 +3932,11 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz BB19_2
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
-; GFX1064-NEXT: v_mov_b32_e32 v7, s3
+; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
+; GFX1064-NEXT: v_mov_b32_e32 v4, s3
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX1064-NEXT: ds_min_rtn_i32 v0, v0, v7
+; GFX1064-NEXT: ds_min_rtn_i32 v0, v7, v4
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_gl1_inv
@@ -3983,11 +3983,11 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB19_2
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
-; GFX1032-NEXT: v_mov_b32_e32 v7, s3
+; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
+; GFX1032-NEXT: v_mov_b32_e32 v4, s3
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX1032-NEXT: ds_min_rtn_i32 v0, v0, v7
+; GFX1032-NEXT: ds_min_rtn_i32 v0, v7, v4
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
@@ -4355,11 +4355,11 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz BB21_2
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
-; GFX1064-NEXT: v_mov_b32_e32 v7, s3
+; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
+; GFX1064-NEXT: v_mov_b32_e32 v4, s3
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX1064-NEXT: ds_max_rtn_u32 v0, v0, v7
+; GFX1064-NEXT: ds_max_rtn_u32 v0, v7, v4
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_gl1_inv
@@ -4407,11 +4407,11 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB21_2
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
-; GFX1032-NEXT: v_mov_b32_e32 v7, s3
+; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
+; GFX1032-NEXT: v_mov_b32_e32 v4, s3
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX1032-NEXT: ds_max_rtn_u32 v0, v0, v7
+; GFX1032-NEXT: ds_max_rtn_u32 v0, v7, v4
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
@@ -4773,11 +4773,11 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz BB23_2
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
-; GFX1064-NEXT: v_mov_b32_e32 v7, s3
+; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
+; GFX1064-NEXT: v_mov_b32_e32 v4, s3
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX1064-NEXT: ds_min_rtn_u32 v0, v0, v7
+; GFX1064-NEXT: ds_min_rtn_u32 v0, v7, v4
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_gl1_inv
@@ -4824,11 +4824,11 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB23_2
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
-; GFX1032-NEXT: v_mov_b32_e32 v7, s3
+; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
+; GFX1032-NEXT: v_mov_b32_e32 v4, s3
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX1032-NEXT: ds_min_rtn_u32 v0, v0, v7
+; GFX1032-NEXT: ds_min_rtn_u32 v0, v7, v4
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll
index 2a132ce2ccd9..6bf89434e0bb 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll
@@ -676,13 +676,13 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v10, 0xffff
; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_and_b32_e32 v5, v10, v5
; GFX10-NEXT: v_and_b32_e32 v3, v10, v3
; GFX10-NEXT: v_and_b32_e32 v1, v10, v1
-; GFX10-NEXT: v_and_b32_e32 v5, v10, v5
+; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5
; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3
-; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1
-; GFX10-NEXT: v_lshl_or_b32 v6, v6, 16, v5
-; GFX10-NEXT: image_sample_c_d v[0:3], [v0, v1, v3, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX10-NEXT: v_lshl_or_b32 v2, v2, 16, v1
+; GFX10-NEXT: image_sample_c_d v[0:3], [v0, v2, v3, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
@@ -730,13 +730,13 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff
; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_and_b32_e32 v0, v7, v0
; GFX10-NEXT: v_and_b32_e32 v4, v7, v4
; GFX10-NEXT: v_and_b32_e32 v2, v7, v2
-; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v4
-; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2
-; GFX10-NEXT: image_sample_d_cl v[0:3], [v0, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX10-NEXT: v_and_b32_e32 v0, v7, v0
+; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4
+; GFX10-NEXT: v_lshl_or_b32 v5, v3, 16, v2
+; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0
+; GFX10-NEXT: image_sample_d_cl v[0:3], [v3, v5, v4, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
@@ -787,12 +787,12 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX10-NEXT: v_mov_b32_e32 v8, 0xffff
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_and_b32_e32 v5, v8, v5
-; GFX10-NEXT: v_and_b32_e32 v1, v8, v1
; GFX10-NEXT: v_and_b32_e32 v3, v8, v3
+; GFX10-NEXT: v_and_b32_e32 v1, v8, v1
; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5
-; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1
-; GFX10-NEXT: v_lshl_or_b32 v6, v4, 16, v3
-; GFX10-NEXT: image_sample_c_d_cl v[0:3], [v0, v1, v6, v5, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3
+; GFX10-NEXT: v_lshl_or_b32 v2, v2, 16, v1
+; GFX10-NEXT: image_sample_c_d_cl v[0:3], [v0, v2, v3, v5, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
@@ -888,13 +888,13 @@ define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v10, 0xffff
; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_and_b32_e32 v5, v10, v5
; GFX10-NEXT: v_and_b32_e32 v3, v10, v3
; GFX10-NEXT: v_and_b32_e32 v1, v10, v1
-; GFX10-NEXT: v_and_b32_e32 v5, v10, v5
+; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5
; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3
-; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1
-; GFX10-NEXT: v_lshl_or_b32 v6, v6, 16, v5
-; GFX10-NEXT: image_sample_c_cd v[0:3], [v0, v1, v3, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX10-NEXT: v_lshl_or_b32 v2, v2, 16, v1
+; GFX10-NEXT: image_sample_c_cd v[0:3], [v0, v2, v3, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
@@ -942,13 +942,13 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff
; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_and_b32_e32 v0, v7, v0
; GFX10-NEXT: v_and_b32_e32 v4, v7, v4
; GFX10-NEXT: v_and_b32_e32 v2, v7, v2
-; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v4
-; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2
-; GFX10-NEXT: image_sample_cd_cl v[0:3], [v0, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX10-NEXT: v_and_b32_e32 v0, v7, v0
+; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4
+; GFX10-NEXT: v_lshl_or_b32 v5, v3, 16, v2
+; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0
+; GFX10-NEXT: image_sample_cd_cl v[0:3], [v3, v5, v4, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
@@ -999,12 +999,12 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX10-NEXT: v_mov_b32_e32 v8, 0xffff
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_and_b32_e32 v5, v8, v5
-; GFX10-NEXT: v_and_b32_e32 v1, v8, v1
; GFX10-NEXT: v_and_b32_e32 v3, v8, v3
+; GFX10-NEXT: v_and_b32_e32 v1, v8, v1
; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5
-; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1
-; GFX10-NEXT: v_lshl_or_b32 v6, v4, 16, v3
-; GFX10-NEXT: image_sample_c_cd_cl v[0:3], [v0, v1, v6, v5, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3
+; GFX10-NEXT: v_lshl_or_b32 v2, v2, 16, v1
+; GFX10-NEXT: image_sample_c_cd_cl v[0:3], [v0, v2, v3, v5, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
@@ -1203,13 +1203,13 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32>
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v9, 0xffff
; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_and_b32_e32 v6, v9, v6
; GFX10-NEXT: v_and_b32_e32 v4, v9, v4
; GFX10-NEXT: v_and_b32_e32 v2, v9, v2
-; GFX10-NEXT: v_and_b32_e32 v6, v9, v6
+; GFX10-NEXT: v_lshl_or_b32 v6, v7, 16, v6
; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4
-; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2
-; GFX10-NEXT: v_lshl_or_b32 v7, v7, 16, v6
-; GFX10-NEXT: image_sample_c_d_o v0, [v0, v1, v2, v4, v7, v8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16
+; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2
+; GFX10-NEXT: image_sample_c_d_o v0, [v0, v1, v3, v4, v6, v8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
@@ -1238,13 +1238,13 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v9, 0xffff
; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_and_b32_e32 v6, v9, v6
; GFX10-NEXT: v_and_b32_e32 v4, v9, v4
; GFX10-NEXT: v_and_b32_e32 v2, v9, v2
-; GFX10-NEXT: v_and_b32_e32 v6, v9, v6
+; GFX10-NEXT: v_lshl_or_b32 v6, v7, 16, v6
; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4
-; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2
-; GFX10-NEXT: v_lshl_or_b32 v7, v7, 16, v6
-; GFX10-NEXT: image_sample_c_d_o v[0:1], [v0, v1, v2, v4, v7, v8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY a16
+; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2
+; GFX10-NEXT: image_sample_c_d_o v[0:1], [v0, v1, v3, v4, v6, v8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll
index 49891aebe701..0ca7ed991813 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll
@@ -93,11 +93,11 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff ; encoding: [0xff,0x02,0x0e,0x7e,0xff,0xff,0x00,0x00]
; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 ; encoding: [0x07,0x01,0x00,0x36]
; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 ; encoding: [0x07,0x05,0x04,0x36]
-; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04]
-; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2 ; encoding: [0x03,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04]
-; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v3, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0x8c,0xf0,0x00,0x00,0x40,0x00,0x03,0x04,0x05,0x06]
+; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 ; encoding: [0x07,0x01,0x00,0x36]
+; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04]
+; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0 ; encoding: [0x03,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04]
+; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v3, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0x8c,0xf0,0x03,0x00,0x40,0x00,0x02,0x04,0x05,0x06]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
; GFX10-NEXT: ; return to shader part epilog
main_body:
@@ -209,11 +209,11 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff ; encoding: [0xff,0x02,0x0e,0x7e,0xff,0xff,0x00,0x00]
; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 ; encoding: [0x07,0x01,0x00,0x36]
; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 ; encoding: [0x07,0x05,0x04,0x36]
-; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04]
-; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2 ; encoding: [0x03,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04]
-; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v3, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa4,0xf1,0x00,0x00,0x40,0x00,0x03,0x04,0x05,0x06]
+; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 ; encoding: [0x07,0x01,0x00,0x36]
+; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04]
+; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0 ; encoding: [0x03,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04]
+; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v3, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa4,0xf1,0x03,0x00,0x40,0x00,0x02,0x04,0x05,0x06]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
; GFX10-NEXT: ; return to shader part epilog
main_body:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll
index e4214ef54cf8..7f7d5b376d71 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll
@@ -93,11 +93,11 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff
; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_and_b32_e32 v0, v7, v0
; GFX10-NEXT: v_and_b32_e32 v2, v7, v2
-; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2
-; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v3, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX10-NEXT: v_and_b32_e32 v0, v7, v0
+; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2
+; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0
+; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v3, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
@@ -209,11 +209,11 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff
; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_and_b32_e32 v0, v7, v0
; GFX10-NEXT: v_and_b32_e32 v2, v7, v2
-; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2
-; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v3, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX10-NEXT: v_and_b32_e32 v0, v7, v0
+; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2
+; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0
+; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v3, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
diff --git a/llvm/test/CodeGen/AMDGPU/regbank-reassign.mir b/llvm/test/CodeGen/AMDGPU/regbank-reassign.mir
index 2078d8c22922..6e4838b6085e 100644
--- a/llvm/test/CodeGen/AMDGPU/regbank-reassign.mir
+++ b/llvm/test/CodeGen/AMDGPU/regbank-reassign.mir
@@ -319,8 +319,8 @@ body: |
...
# GCN-LABEL: smem_bundle{{$}}
-# GCN: S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr0_sgpr1_sgpr2_sgpr3, renamable $sgpr15, 0, 0
# GCN: S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr0_sgpr1_sgpr2_sgpr3, renamable $sgpr14, 0, 0
+# GCN: S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr0_sgpr1_sgpr2_sgpr3, renamable $sgpr15, 0, 0
---
name: smem_bundle
tracksRegLiveness: true
More information about the llvm-commits mailing list