[llvm] [AMDGPU] Allocate AVRegClass last (PR #146606)
Jeffrey Byrnes via llvm-commits
llvm-commits at lists.llvm.org
Fri Jul 25 12:08:03 PDT 2025
https://github.com/jrbyrnes updated https://github.com/llvm/llvm-project/pull/146606
>From f3fc4e166519456eca573eec8a71adf43b8eecd1 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Fri, 27 Jun 2025 15:29:10 -0700
Subject: [PATCH 1/5] [AMDGPU] Allocate AVRegClass last
Change-Id: Iace3462f27ea276b22716793ebfa13b5026b0e58
---
llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 43 +-
.../AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll | 240 +++----
llvm/test/CodeGen/AMDGPU/build_vector.ll | 28 +-
.../CodeGen/AMDGPU/fix-crash-valu-hazard.ll | 10 +-
.../CodeGen/AMDGPU/fp64-atomics-gfx90a.ll | 240 +++----
.../AMDGPU/infer-addrspace-flat-atomic.ll | 24 +-
...-reg-class-snippet-copy-use-after-free.mir | 32 +-
.../AMDGPU/insert_vector_elt.v2bf16.ll | 14 +-
...llvm.amdgcn.iglp.AFLCustomIRMutator.opt.ll | 40 +-
.../AMDGPU/llvm.amdgcn.image.dim.gfx90a.ll | 61 +-
.../AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll | 116 ++--
.../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll | 424 ++++++------
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll | 148 ++---
....amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll | 234 +++----
...m.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll | 400 ++++++------
.../AMDGPU/llvm.amdgcn.smfmac.gfx950.ll | 606 +++++++++---------
llvm/test/CodeGen/AMDGPU/packed-fp32.ll | 8 +-
...al-regcopy-and-spill-missed-at-regalloc.ll | 13 +-
.../AMDGPU/ptradd-sdag-optimizations.ll | 42 +-
.../regalloc-illegal-eviction-assert.ll | 4 +-
.../CodeGen/AMDGPU/scalar_to_vector.v8i16.ll | 36 +-
.../AMDGPU/shufflevector-physreg-copy.ll | 106 ++-
.../AMDGPU/tuple-allocation-failure.ll | 120 ++--
.../AMDGPU/undef-handling-crash-in-ra.ll | 47 +-
.../CodeGen/AMDGPU/vector_shuffle.packed.ll | 30 +-
.../test/CodeGen/AMDGPU/vni8-across-blocks.ll | 258 ++++----
26 files changed, 1677 insertions(+), 1647 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index 0039d2ffa100e..218841d182ff2 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -109,6 +109,23 @@ class SIRegisterClass <string n, list<ValueType> rTypes, int Align, dag rList>
let TSFlags{2} = HasVGPR;
let TSFlags{3} = HasAGPR;
let TSFlags{4} = HasSGPR;
+
+ // RA will use RegisterClass AllocationPriority amongst other info (e.g. ordering in the basic block)
+ // to decide which registers to try to assign first. Usually, this RegisterClass priority is given
+ // very high priority, if not the highest priority, when considering which VirtReg to allocate next.
+ //
+ // We have 5 bits to assign AllocationPriorities to RegisterClasses. Generally, it is beneficial to
+ // assign more constrained RegisterClasses first. As a result, we prioritize register classes with
+ // more 32 bit tuples (e.g. VReg_512) over registers with fewer tuples (e.g. VGPR_32).
+ //
+ // The interesting case is the vector register case on architectures which have ARegs, VRegs, AVRegs.
+ // In this case, we would like to assign ARegs and VRegs before AVRegs, as AVRegs are less constrained
+ // and can be assigned to both AGPRs and VGPRs. We use the 5th bit to encode this into the
+ // RegisterClass AllocationPriority. BaseClassPriority is used to turn the bit on, and BaseClassScaleFactor
+ // is used for scaling of the bit (i.e. 1 << 4).
+ field int BaseClassPriority = 1;
+ field int BaseClassScaleFactor = 16;
+
}
multiclass SIRegLoHi16 <string n, bits<8> regIdx, bit ArtificialHigh = 1,
@@ -575,7 +592,7 @@ let HasVGPR = 1 in {
def VGPR_16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16,
(add (interleave (sequence "VGPR%u_LO16", 0, 255),
(sequence "VGPR%u_HI16", 0, 255)))> {
- let AllocationPriority = 2;
+ let AllocationPriority = !add(2, !mul(BaseClassPriority, BaseClassScaleFactor));
let Size = 16;
let GeneratePressureSet = 0;
@@ -601,7 +618,7 @@ def VGPR_16_Lo128 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16,
// i16/f16 only on VI+
def VGPR_32 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types.types), 32,
(add (sequence "VGPR%u", 0, 255))> {
- let AllocationPriority = 0;
+ let AllocationPriority = !add(0, !mul(BaseClassPriority, BaseClassScaleFactor));
let Size = 32;
let Weight = 1;
let BaseClassOrder = 32;
@@ -610,7 +627,7 @@ def VGPR_32 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types
// Identical to VGPR_32 except it only contains the low 128 (Lo128) registers.
def VGPR_32_Lo128 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types.types), 32,
(add (sequence "VGPR%u", 0, 127))> {
- let AllocationPriority = 0;
+ let AllocationPriority = !add(0, !mul(BaseClassPriority, BaseClassScaleFactor));
let GeneratePressureSet = 0;
let Size = 32;
let Weight = 1;
@@ -668,7 +685,7 @@ def AGPR_LO16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16,
// AccVGPR 32-bit registers
def AGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32,
(add (sequence "AGPR%u", 0, 255))> {
- let AllocationPriority = 0;
+ let AllocationPriority = !add(0, !mul(BaseClassPriority, BaseClassScaleFactor));
let Size = 32;
let Weight = 1;
let BaseClassOrder = 32;
@@ -940,14 +957,23 @@ class VRegClassBase<int numRegs, list<ValueType> regTypes, dag regList> :
// Requires n v_mov_b32 to copy
let CopyCost = numRegs;
- let AllocationPriority = !sub(numRegs, 1);
+
+ // Since we only have 5 bits for the RegisterClass Allocation Priorty, and since we use the
+ // 5th bit for BaseClassPriority, we need to encode the SizePriority into 4 bits. As a result
+ // of this encoding, for registers with numRegs 15 or 16, we give SizePriority of 14, and for
+ // regsters with numRegs 17+ we give SizePriority of 15. In practice, there is only one
+ // RegClass per Vector Register type in each of these groups (i.e. numRegs = 15,16 : {VReg_512},
+ // and numRegs = 17+ : {VReg_1024}). Therefore, we have not lost any info by compressing.
+ defvar SizePrioriity = !if(!le(numRegs, 14), !sub(numRegs, 1), !if(!le(numRegs, 16), 14, 15));
+
+ let AllocationPriority = !add(SizePrioriity, !mul(BaseClassPriority, BaseClassScaleFactor));
let Weight = numRegs;
}
// Define a register tuple class, along with one requiring an even
// aligned base register.
multiclass VRegClass<int numRegs, list<ValueType> regTypes, dag regList> {
- let HasVGPR = 1 in {
+ let HasVGPR = 1, BaseClassPriority = 1 in {
// Define the regular class.
def "" : VRegClassBase<numRegs, regTypes, regList> {
let BaseClassOrder = !mul(numRegs, 32);
@@ -981,7 +1007,7 @@ defm VReg_1024 : VRegClass<32, Reg1024Types.types, (add VGPR_1024)>;
}
multiclass ARegClass<int numRegs, list<ValueType> regTypes, dag regList> {
- let CopyCost = !add(numRegs, numRegs, 1), HasAGPR = 1 in {
+ let CopyCost = !add(numRegs, numRegs, 1), HasAGPR = 1, BaseClassPriority = 1 in {
// Define the regular class.
def "" : VRegClassBase<numRegs, regTypes, regList> {
let BaseClassOrder = !mul(numRegs, 32);
@@ -1066,6 +1092,7 @@ def VS_64 : SIRegisterClass<"AMDGPU", VReg_64.RegTypes, 32, (add VReg_64, SReg_6
def AV_32 : SIRegisterClass<"AMDGPU", VGPR_32.RegTypes, 32, (add VGPR_32, AGPR_32)> {
let HasVGPR = 1;
let HasAGPR = 1;
+ let BaseClassPriority = 0;
let Size = 32;
}
} // End GeneratePressureSet = 0
@@ -1074,7 +1101,7 @@ def AV_32 : SIRegisterClass<"AMDGPU", VGPR_32.RegTypes, 32, (add VGPR_32, AGPR_3
// aligned base register.
multiclass AVRegClass<int numRegs, list<ValueType> regTypes,
dag vregList, dag aregList> {
- let HasVGPR = 1, HasAGPR = 1 in {
+ let HasVGPR = 1, HasAGPR = 1, BaseClassPriority = 0 in {
// Define the regular class.
def "" : VRegClassBase<numRegs, regTypes, (add vregList, aregList)>;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
index 8192d4a6457d7..0e132f130c844 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
@@ -70,12 +70,12 @@ define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsr
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc:
@@ -85,12 +85,12 @@ define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsr
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
@@ -153,12 +153,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrsp
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_ptr_buffer_atomic_add_rtn_f64_off4_slc:
@@ -168,12 +168,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrsp
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2)
@@ -236,12 +236,12 @@ define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc:
@@ -251,12 +251,12 @@ define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
@@ -319,12 +319,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr add
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_ptr_buffer_atomic_add_rtn_f64_off4_slc:
@@ -334,12 +334,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr add
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
@@ -402,12 +402,12 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsr
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc:
@@ -417,12 +417,12 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsr
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
@@ -485,12 +485,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrsp
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_ptr_buffer_atomic_min_rtn_f64_off4_slc:
@@ -500,12 +500,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrsp
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2)
@@ -568,12 +568,12 @@ define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_buffer_atomic_min_rtn_f64_off4_slc:
@@ -583,12 +583,12 @@ define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
@@ -651,12 +651,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr add
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_ptr_buffer_atomic_min_rtn_f64_off4_slc:
@@ -666,12 +666,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr add
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
@@ -734,12 +734,12 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsr
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
@@ -749,12 +749,12 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsr
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
@@ -817,12 +817,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc:
@@ -832,12 +832,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2)
@@ -900,12 +900,12 @@ define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc:
@@ -915,12 +915,12 @@ define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
@@ -983,12 +983,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr add
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_ptr_buffer_atomic_max_rtn_f64_off4_slc:
@@ -998,12 +998,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr add
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
diff --git a/llvm/test/CodeGen/AMDGPU/build_vector.ll b/llvm/test/CodeGen/AMDGPU/build_vector.ll
index 7208eaeff8eb1..763f436997c21 100644
--- a/llvm/test/CodeGen/AMDGPU/build_vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/build_vector.ll
@@ -51,11 +51,11 @@ define amdgpu_kernel void @build_vector2 (ptr addrspace(1) %out) {
; GFX942-LABEL: build_vector2:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
-; GFX942-NEXT: v_mov_b32_e32 v0, 5
-; GFX942-NEXT: v_mov_b32_e32 v1, 6
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, 5
+; GFX942-NEXT: v_mov_b32_e32 v3, 6
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1]
; GFX942-NEXT: s_endpgm
entry:
store <2 x i32> <i32 5, i32 6>, ptr addrspace(1) %out
@@ -116,13 +116,13 @@ define amdgpu_kernel void @build_vector4 (ptr addrspace(1) %out) {
; GFX942-LABEL: build_vector4:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-NEXT: v_mov_b32_e32 v0, 5
-; GFX942-NEXT: v_mov_b32_e32 v1, 6
-; GFX942-NEXT: v_mov_b32_e32 v2, 7
-; GFX942-NEXT: v_mov_b32_e32 v3, 8
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, 5
+; GFX942-NEXT: v_mov_b32_e32 v3, 6
+; GFX942-NEXT: v_mov_b32_e32 v4, 7
+; GFX942-NEXT: v_mov_b32_e32 v5, 8
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
; GFX942-NEXT: s_endpgm
entry:
store <4 x i32> <i32 5, i32 6, i32 7, i32 8>, ptr addrspace(1) %out
@@ -307,13 +307,13 @@ define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out,
; GFX942-LABEL: build_v2i32_from_v4i16_shuffle:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_lshl_b32 s3, s3, 16
; GFX942-NEXT: s_lshl_b32 s2, s2, 16
-; GFX942-NEXT: v_mov_b32_e32 v0, s2
-; GFX942-NEXT: v_mov_b32_e32 v1, s3
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v2, s2
+; GFX942-NEXT: v_mov_b32_e32 v3, s3
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1]
; GFX942-NEXT: s_endpgm
entry:
%shuf = shufflevector <4 x i16> %in, <4 x i16> zeroinitializer, <2 x i32> <i32 0, i32 2>
diff --git a/llvm/test/CodeGen/AMDGPU/fix-crash-valu-hazard.ll b/llvm/test/CodeGen/AMDGPU/fix-crash-valu-hazard.ll
index 87811968c7871..4f752d102db74 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-crash-valu-hazard.ll
+++ b/llvm/test/CodeGen/AMDGPU/fix-crash-valu-hazard.ll
@@ -8,10 +8,10 @@ define amdgpu_ps void @global_load_lds_dword_saddr(ptr addrspace(1) inreg nocapt
; GFX942-LABEL: global_load_lds_dword_saddr:
; GFX942: ; %bb.0: ; %main_body
; GFX942-NEXT: v_readfirstlane_b32 s2, v0
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b32_e32 v1, 0
; GFX942-NEXT: s_mov_b32 m0, s2
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: global_load_lds_dword v2, s[0:1] offset:32 nt
+; GFX942-NEXT: global_load_lds_dword v1, s[0:1] offset:32 nt
; GFX942-NEXT: s_getpc_b64 s[0:1]
; GFX942-NEXT: s_add_u32 s0, s0, G at gotpcrel32@lo+4
; GFX942-NEXT: s_addc_u32 s1, s1, G at gotpcrel32@hi+12
@@ -21,9 +21,9 @@ define amdgpu_ps void @global_load_lds_dword_saddr(ptr addrspace(1) inreg nocapt
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_mul_i32 s3, s3, 10
; GFX942-NEXT: s_mul_i32 s2, s2, 10
-; GFX942-NEXT: v_mov_b32_e32 v0, s2
-; GFX942-NEXT: v_mov_b32_e32 v1, s3
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v2, s2
+; GFX942-NEXT: v_mov_b32_e32 v3, s3
+; GFX942-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1]
; GFX942-NEXT: s_endpgm
;
; GFX90A-LABEL: global_load_lds_dword_saddr:
diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
index 873fceedd7b72..6067194d947fa 100644
--- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
@@ -71,12 +71,12 @@ define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsr
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc:
@@ -86,12 +86,12 @@ define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsr
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
@@ -154,12 +154,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrsp
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_ptr_buffer_atomic_add_rtn_f64_off4_slc:
@@ -169,12 +169,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrsp
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2)
@@ -237,12 +237,12 @@ define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc:
@@ -252,12 +252,12 @@ define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
@@ -320,12 +320,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr add
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_ptr_buffer_atomic_add_rtn_f64_off4_slc:
@@ -335,12 +335,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr add
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
@@ -403,12 +403,12 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsr
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc:
@@ -418,12 +418,12 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsr
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
@@ -486,12 +486,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrsp
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_ptr_buffer_atomic_min_rtn_f64_off4_slc:
@@ -501,12 +501,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrsp
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2)
@@ -569,12 +569,12 @@ define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_buffer_atomic_min_rtn_f64_off4_slc:
@@ -584,12 +584,12 @@ define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
@@ -652,12 +652,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr add
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_ptr_buffer_atomic_min_rtn_f64_off4_slc:
@@ -667,12 +667,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr add
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
@@ -735,12 +735,12 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsr
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
@@ -750,12 +750,12 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsr
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
@@ -818,12 +818,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc:
@@ -833,12 +833,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2)
@@ -901,12 +901,12 @@ define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc:
@@ -916,12 +916,12 @@ define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
@@ -984,12 +984,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr add
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_ptr_buffer_atomic_max_rtn_f64_off4_slc:
@@ -999,12 +999,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr add
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
diff --git a/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll b/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll
index 258aa9e299c3d..0a493e5188ad5 100644
--- a/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll
+++ b/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll
@@ -8,15 +8,15 @@ define protected amdgpu_kernel void @InferNothing(i32 %a, ptr %b, double %c) {
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_ashr_i32 s7, s6, 31
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: v_mov_b32_e32 v2, s2
+; CHECK-NEXT: v_mov_b32_e32 v3, s3
; CHECK-NEXT: s_lshl_b64 s[2:3], s[6:7], 3
; CHECK-NEXT: s_add_u32 s0, s2, s0
; CHECK-NEXT: s_addc_u32 s1, s3, s1
-; CHECK-NEXT: v_mov_b32_e32 v3, s1
-; CHECK-NEXT: v_add_co_u32_e64 v2, vcc, -8, s0
-; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc
-; CHECK-NEXT: flat_atomic_add_f64 v[2:3], v[0:1]
+; CHECK-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-NEXT: v_add_co_u32_e64 v0, vcc, -8, s0
+; CHECK-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; CHECK-NEXT: flat_atomic_add_f64 v[0:1], v[2:3]
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: buffer_wbinvl1_vol
; CHECK-NEXT: s_endpgm
@@ -35,15 +35,15 @@ define protected amdgpu_kernel void @InferFadd(i32 %a, ptr addrspace(1) %b, doub
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_ashr_i32 s7, s6, 31
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: v_mov_b32_e32 v2, s2
+; CHECK-NEXT: v_mov_b32_e32 v3, s3
; CHECK-NEXT: s_lshl_b64 s[2:3], s[6:7], 3
; CHECK-NEXT: s_add_u32 s0, s0, s2
; CHECK-NEXT: s_addc_u32 s1, s1, s3
-; CHECK-NEXT: v_mov_b32_e32 v3, s1
-; CHECK-NEXT: v_add_co_u32_e64 v2, vcc, -8, s0
-; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc
-; CHECK-NEXT: flat_atomic_add_f64 v[2:3], v[0:1]
+; CHECK-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-NEXT: v_add_co_u32_e64 v0, vcc, -8, s0
+; CHECK-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; CHECK-NEXT: flat_atomic_add_f64 v[0:1], v[2:3]
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: buffer_wbinvl1_vol
; CHECK-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/inflated-reg-class-snippet-copy-use-after-free.mir b/llvm/test/CodeGen/AMDGPU/inflated-reg-class-snippet-copy-use-after-free.mir
index 11de6c8d52d59..06c3da09eede9 100644
--- a/llvm/test/CodeGen/AMDGPU/inflated-reg-class-snippet-copy-use-after-free.mir
+++ b/llvm/test/CodeGen/AMDGPU/inflated-reg-class-snippet-copy-use-after-free.mir
@@ -32,32 +32,14 @@
# CHECK-NEXT: undef [[SPLIT0:%[0-9]+]].sub2_sub3:av_512_align2 = COPY undef $vgpr2_vgpr3 {
# CHECK-NEXT: internal [[SPLIT0]].sub0:av_512_align2 = COPY undef $vgpr0
# CHECK-NEXT: }
-# CHECK-NEXT: undef [[SPLIT1:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[SPLIT0]].sub2_sub3 {
-# CHECK-NEXT: internal [[SPLIT1]].sub0:av_512_align2 = COPY [[SPLIT0]].sub0
-# CHECK-NEXT: }
-# CHECK-NEXT: undef [[SPLIT2:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[SPLIT1]].sub2_sub3 {
-# CHECK-NEXT: internal [[SPLIT2]].sub0:av_512_align2 = COPY [[SPLIT1]].sub0
-# CHECK-NEXT: }
-# CHECK-NEXT: SI_SPILL_AV512_SAVE [[SPLIT2]], %stack.1, $sgpr32, 0, implicit $exec :: (store (s512) into %stack.1, align 4, addrspace 5)
-# CHECK-NEXT: [[RESTORE1:%[0-9]+]]:av_512_align2 = SI_SPILL_AV512_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.0, align 4, addrspace 5)
-# CHECK-NEXT: undef [[SPLIT3:%[0-9]+]].sub0_sub1:av_512_align2 = COPY [[RESTORE1]].sub0_sub1
-# CHECK-NEXT: [[RESTORE2:%[0-9]+]]:av_512_align2 = SI_SPILL_AV512_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.1, align 4, addrspace 5)
-# CHECK-NEXT: undef [[SPLIT3:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[RESTORE2]].sub2_sub3 {
-# CHECK-NEXT: internal [[SPLIT3]].sub0:av_512_align2 = COPY [[RESTORE2]].sub0
-# CHECK-NEXT: }
-# CHECK-NEXT: undef [[SPLIT4:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[SPLIT3]].sub2_sub3 {
-# CHECK-NEXT: internal [[SPLIT4]].sub0:av_512_align2 = COPY [[SPLIT3]].sub0
-# CHECK-NEXT: }
-# CHECK-NEXT: [[SPLIT5:%[0-9]+]].sub2:av_512_align2 = COPY [[SPLIT4]].sub3
-# CHECK-NEXT: undef [[SPLIT6:%[0-9]+]].sub0_sub1_sub2:av_512_align2 = COPY [[SPLIT5]].sub0_sub1_sub2
-# CHECK-NEXT: undef [[SPLIT7:%[0-9]+]].sub0_sub1_sub2:av_512_align2 = COPY [[SPLIT6]].sub0_sub1_sub2
-# CHECK-NEXT: undef [[SPLIT8:%[0-9]+]].sub0:av_512_align2 = COPY [[SPLIT4]].sub0 {
-# CHECK-NEXT: internal [[SPLIT8]].sub2:av_512_align2 = COPY [[SPLIT4]].sub2
+# CHECK-NEXT: undef [[SPLIT2:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[SPLIT0]].sub2_sub3 {
+# CHECK-NEXT: internal [[SPLIT2]].sub0:av_512_align2 = COPY [[SPLIT0]].sub0
# CHECK-NEXT: }
-# CHECK-NEXT: [[SPLIT9:%[0-9]+]].sub3:av_512_align2 = COPY [[SPLIT8]].sub2
-# CHECK-NEXT: undef [[SPLIT10:%[0-9]+]].sub0_sub1_sub2_sub3:av_512_align2 = COPY [[SPLIT9]].sub0_sub1_sub2_sub3
-# CHECK-NEXT: undef [[SPLIT13:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_512_align2 = COPY [[SPLIT10]].sub0_sub1_sub2_sub3
-# CHECK-NEXT: [[MFMA_USE1:%[0-9]+]].sub4:vreg_512_align2 = COPY [[SPLIT8]].sub0
+# CHECK-NEXT: [[RESTORE2:%[0-9]+]]:av_512_align2 = SI_SPILL_AV512_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.0, align 4, addrspace 5)
+# CHECK-NEXT: [[MFMA_USE1:%[0-9]+]].sub0_sub1:vreg_512_align2 = COPY [[RESTORE2]].sub0_sub1
+# CHECK-NEXT: [[MFMA_USE1]].sub2:vreg_512_align2 = COPY [[SPLIT2]].sub3
+# CHECK-NEXT: [[MFMA_USE1]].sub3:vreg_512_align2 = COPY [[SPLIT2]].sub2
+# CHECK-NEXT: [[MFMA_USE1]].sub4:vreg_512_align2 = COPY [[SPLIT2]].sub0
# CHECK-NEXT: [[MFMA_USE1]].sub5:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
# CHECK-NEXT: [[MFMA_USE1]].sub6:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
# CHECK-NEXT: [[MFMA_USE1]].sub7:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll
index 1ac75d399e67d..d8c983a081b98 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll
@@ -1331,16 +1331,16 @@ define amdgpu_kernel void @v_insertelement_v16bf16_3(ptr addrspace(1) %out, ptr
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX942-NEXT: s_load_dword s6, s[4:5], 0x10
; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v8, 5, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, 0x5040100
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, 5, v0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0x5040100
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3]
-; GFX942-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:16
+; GFX942-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
+; GFX942-NEXT: global_load_dwordx4 v[6:9], v4, s[2:3] offset:16
; GFX942-NEXT: s_waitcnt vmcnt(1)
-; GFX942-NEXT: v_perm_b32 v1, s6, v1, v9
+; GFX942-NEXT: v_perm_b32 v1, s6, v1, v5
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.AFLCustomIRMutator.opt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.AFLCustomIRMutator.opt.ll
index 85dd275207902..fcdad53553823 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.AFLCustomIRMutator.opt.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.AFLCustomIRMutator.opt.ll
@@ -4,30 +4,30 @@
define amdgpu_kernel void @test_iglp_opt_rev_mfma_gemm(<1 x i64> %L1) {
; GCN-LABEL: test_iglp_opt_rev_mfma_gemm:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: v_mov_b32_e32 v32, 0
-; GCN-NEXT: ds_read_b128 v[0:3], v32
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: ds_read_b128 v[2:5], v0
; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GCN-NEXT: ds_read_b128 v[28:31], v32 offset:112
-; GCN-NEXT: ds_read_b128 v[24:27], v32 offset:96
-; GCN-NEXT: ds_read_b128 v[20:23], v32 offset:80
-; GCN-NEXT: ds_read_b128 v[16:19], v32 offset:64
-; GCN-NEXT: ds_read_b128 v[4:7], v32 offset:16
-; GCN-NEXT: ds_read_b128 v[8:11], v32 offset:32
-; GCN-NEXT: ds_read_b128 v[12:15], v32 offset:48
+; GCN-NEXT: ds_read_b128 v[30:33], v0 offset:112
+; GCN-NEXT: ds_read_b128 v[26:29], v0 offset:96
+; GCN-NEXT: ds_read_b128 v[22:25], v0 offset:80
+; GCN-NEXT: ds_read_b128 v[18:21], v0 offset:64
+; GCN-NEXT: ds_read_b128 v[6:9], v0 offset:16
+; GCN-NEXT: ds_read_b128 v[10:13], v0 offset:32
+; GCN-NEXT: ds_read_b128 v[14:17], v0 offset:48
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: ds_write_b128 v32, v[0:3]
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: v_mov_b32_e32 v1, v0
+; GCN-NEXT: ds_write_b128 v0, v[2:5]
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: v_mov_b32_e32 v3, v2
; GCN-NEXT: s_cmp_lg_u64 s[0:1], 0
; GCN-NEXT: ; iglp_opt mask(0x00000001)
-; GCN-NEXT: ds_write_b128 v32, v[28:31] offset:112
-; GCN-NEXT: ds_write_b128 v32, v[24:27] offset:96
-; GCN-NEXT: ds_write_b128 v32, v[20:23] offset:80
-; GCN-NEXT: ds_write_b128 v32, v[16:19] offset:64
-; GCN-NEXT: ds_write_b128 v32, v[12:15] offset:48
-; GCN-NEXT: ds_write_b128 v32, v[8:11] offset:32
-; GCN-NEXT: ds_write_b128 v32, v[4:7] offset:16
-; GCN-NEXT: ds_write_b64 v32, v[0:1]
+; GCN-NEXT: ds_write_b128 v0, v[30:33] offset:112
+; GCN-NEXT: ds_write_b128 v0, v[26:29] offset:96
+; GCN-NEXT: ds_write_b128 v0, v[22:25] offset:80
+; GCN-NEXT: ds_write_b128 v0, v[18:21] offset:64
+; GCN-NEXT: ds_write_b128 v0, v[14:17] offset:48
+; GCN-NEXT: ds_write_b128 v0, v[10:13] offset:32
+; GCN-NEXT: ds_write_b128 v0, v[6:9] offset:16
+; GCN-NEXT: ds_write_b64 v0, v[2:3]
; GCN-NEXT: s_endpgm
entry:
call void @llvm.amdgcn.iglp.opt(i32 1)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.gfx90a.ll
index ed7d88b162bac..dcac419f8591d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.gfx90a.ll
@@ -18,19 +18,22 @@ define amdgpu_ps <4 x float> @load_1d_lwe(<8 x i32> inreg %rsrc, ptr addrspace(1
; GCN-LABEL: load_1d_lwe:
; GCN: ; %bb.0: ; %main_body
; GCN-NEXT: v_mov_b32_e32 v8, 0
-; GCN-NEXT: v_mov_b32_e32 v6, v0
; GCN-NEXT: v_mov_b32_e32 v9, v8
; GCN-NEXT: v_mov_b32_e32 v10, v8
; GCN-NEXT: v_mov_b32_e32 v11, v8
; GCN-NEXT: v_mov_b32_e32 v12, v8
-; GCN-NEXT: v_mov_b32_e32 v0, v8
-; GCN-NEXT: v_mov_b32_e32 v1, v9
-; GCN-NEXT: v_mov_b32_e32 v2, v10
-; GCN-NEXT: v_mov_b32_e32 v3, v11
-; GCN-NEXT: v_mov_b32_e32 v4, v12
-; GCN-NEXT: image_load v[0:4], v6, s[0:7] dmask:0xf unorm lwe
+; GCN-NEXT: v_mov_b32_e32 v2, v8
+; GCN-NEXT: v_mov_b32_e32 v3, v9
+; GCN-NEXT: v_mov_b32_e32 v4, v10
+; GCN-NEXT: v_mov_b32_e32 v5, v11
+; GCN-NEXT: v_mov_b32_e32 v6, v12
+; GCN-NEXT: image_load v[2:6], v0, s[0:7] dmask:0xf unorm lwe
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dword v8, v4, s[8:9]
+; GCN-NEXT: v_mov_b32_e32 v0, v2
+; GCN-NEXT: v_mov_b32_e32 v1, v3
+; GCN-NEXT: v_mov_b32_e32 v2, v4
+; GCN-NEXT: v_mov_b32_e32 v3, v5
+; GCN-NEXT: global_store_dword v8, v6, s[8:9]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
main_body:
@@ -75,6 +78,27 @@ main_body:
}
define amdgpu_ps <4 x float> @load_cube_lwe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %slice) {
+; GCN-LABEL: load_cube_lwe:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: v_mov_b32_e32 v10, 0
+; GCN-NEXT: v_mov_b32_e32 v11, v10
+; GCN-NEXT: v_mov_b32_e32 v12, v10
+; GCN-NEXT: v_mov_b32_e32 v13, v10
+; GCN-NEXT: v_mov_b32_e32 v14, v10
+; GCN-NEXT: v_mov_b32_e32 v4, v10
+; GCN-NEXT: v_mov_b32_e32 v5, v11
+; GCN-NEXT: v_mov_b32_e32 v6, v12
+; GCN-NEXT: v_mov_b32_e32 v7, v13
+; GCN-NEXT: v_mov_b32_e32 v8, v14
+; GCN-NEXT: image_load v[4:8], v[0:2], s[0:7] dmask:0xf unorm lwe da
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, v4
+; GCN-NEXT: v_mov_b32_e32 v1, v5
+; GCN-NEXT: v_mov_b32_e32 v2, v6
+; GCN-NEXT: v_mov_b32_e32 v3, v7
+; GCN-NEXT: global_store_dword v10, v8, s[8:9]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: ; return to shader part epilog
main_body:
%v = call {<4 x float>,i32} @llvm.amdgcn.image.load.cube.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 2, i32 0)
%v.vec = extractvalue {<4 x float>, i32} %v, 0
@@ -106,6 +130,27 @@ main_body:
}
define amdgpu_ps <4 x float> @load_2darray_lwe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %slice) {
+; GCN-LABEL: load_2darray_lwe:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: v_mov_b32_e32 v10, 0
+; GCN-NEXT: v_mov_b32_e32 v11, v10
+; GCN-NEXT: v_mov_b32_e32 v12, v10
+; GCN-NEXT: v_mov_b32_e32 v13, v10
+; GCN-NEXT: v_mov_b32_e32 v14, v10
+; GCN-NEXT: v_mov_b32_e32 v4, v10
+; GCN-NEXT: v_mov_b32_e32 v5, v11
+; GCN-NEXT: v_mov_b32_e32 v6, v12
+; GCN-NEXT: v_mov_b32_e32 v7, v13
+; GCN-NEXT: v_mov_b32_e32 v8, v14
+; GCN-NEXT: image_load v[4:8], v[0:2], s[0:7] dmask:0xf unorm lwe da
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, v4
+; GCN-NEXT: v_mov_b32_e32 v1, v5
+; GCN-NEXT: v_mov_b32_e32 v2, v6
+; GCN-NEXT: v_mov_b32_e32 v3, v7
+; GCN-NEXT: global_store_dword v10, v8, s[8:9]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: ; return to shader part epilog
main_body:
%v = call {<4 x float>,i32} @llvm.amdgcn.image.load.2darray.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 2, i32 0)
%v.vec = extractvalue {<4 x float>, i32} %v, 0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
index 452033f332659..d358837452eab 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
@@ -15,9 +15,9 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16(<8 x bfloat> %arg0, <8 x
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; GCN-NEXT: v_mov_b64_e32 v[12:13], 48
-; GCN-NEXT: v_mov_b64_e32 v[14:15], 32
-; GCN-NEXT: v_mov_b64_e32 v[16:17], 16
+; GCN-NEXT: v_mov_b64_e32 v[8:9], 48
+; GCN-NEXT: v_mov_b64_e32 v[10:11], 32
+; GCN-NEXT: v_mov_b64_e32 v[12:13], 16
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
@@ -39,42 +39,42 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16(<8 x bfloat> %arg0, <8 x
; GCN-NEXT: v_accvgpr_write_b32 a13, s21
; GCN-NEXT: v_accvgpr_write_b32 a14, s22
; GCN-NEXT: v_accvgpr_write_b32 a15, s23
-; GCN-NEXT: v_mov_b64_e32 v[18:19], 0
-; GCN-NEXT: v_mov_b32_e32 v8, s16
+; GCN-NEXT: v_mov_b64_e32 v[14:15], 0
+; GCN-NEXT: v_mov_b32_e32 v16, s16
; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[16:31], v[0:3], v[4:7], a[0:15]
; GCN-NEXT: v_mov_b32_e32 v0, s20
; GCN-NEXT: v_mov_b32_e32 v1, s21
; GCN-NEXT: v_mov_b32_e32 v2, s22
; GCN-NEXT: v_mov_b32_e32 v3, s23
-; GCN-NEXT: v_mov_b32_e32 v9, s17
-; GCN-NEXT: v_mov_b32_e32 v10, s18
-; GCN-NEXT: v_mov_b32_e32 v11, s19
+; GCN-NEXT: v_mov_b32_e32 v17, s17
+; GCN-NEXT: v_mov_b32_e32 v18, s18
+; GCN-NEXT: v_mov_b32_e32 v19, s19
; GCN-NEXT: s_nop 4
-; GCN-NEXT: global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v[16:17], a[20:23], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v[18:19], a[16:19], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v[14:15], v[8:11], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v0, s8
; GCN-NEXT: v_mov_b32_e32 v1, s9
; GCN-NEXT: v_mov_b32_e32 v2, s10
; GCN-NEXT: v_mov_b32_e32 v3, s11
-; GCN-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v0, s12
; GCN-NEXT: v_mov_b32_e32 v1, s13
; GCN-NEXT: v_mov_b32_e32 v2, s14
; GCN-NEXT: v_mov_b32_e32 v3, s15
-; GCN-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0)
@@ -88,9 +88,9 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__flags(<8 x bfloat> %arg0
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; GCN-NEXT: v_mov_b64_e32 v[12:13], 48
-; GCN-NEXT: v_mov_b64_e32 v[14:15], 32
-; GCN-NEXT: v_mov_b64_e32 v[16:17], 16
+; GCN-NEXT: v_mov_b64_e32 v[8:9], 48
+; GCN-NEXT: v_mov_b64_e32 v[10:11], 32
+; GCN-NEXT: v_mov_b64_e32 v[12:13], 16
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
@@ -112,42 +112,42 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__flags(<8 x bfloat> %arg0
; GCN-NEXT: v_accvgpr_write_b32 a13, s21
; GCN-NEXT: v_accvgpr_write_b32 a14, s22
; GCN-NEXT: v_accvgpr_write_b32 a15, s23
-; GCN-NEXT: v_mov_b64_e32 v[18:19], 0
-; GCN-NEXT: v_mov_b32_e32 v8, s16
+; GCN-NEXT: v_mov_b64_e32 v[14:15], 0
+; GCN-NEXT: v_mov_b32_e32 v16, s16
; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1
; GCN-NEXT: v_mov_b32_e32 v0, s20
; GCN-NEXT: v_mov_b32_e32 v1, s21
; GCN-NEXT: v_mov_b32_e32 v2, s22
; GCN-NEXT: v_mov_b32_e32 v3, s23
-; GCN-NEXT: v_mov_b32_e32 v9, s17
-; GCN-NEXT: v_mov_b32_e32 v10, s18
-; GCN-NEXT: v_mov_b32_e32 v11, s19
+; GCN-NEXT: v_mov_b32_e32 v17, s17
+; GCN-NEXT: v_mov_b32_e32 v18, s18
+; GCN-NEXT: v_mov_b32_e32 v19, s19
; GCN-NEXT: s_nop 4
-; GCN-NEXT: global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v[16:17], a[20:23], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v[18:19], a[16:19], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v[14:15], v[8:11], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v0, s8
; GCN-NEXT: v_mov_b32_e32 v1, s9
; GCN-NEXT: v_mov_b32_e32 v2, s10
; GCN-NEXT: v_mov_b32_e32 v3, s11
-; GCN-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v0, s12
; GCN-NEXT: v_mov_b32_e32 v1, s13
; GCN-NEXT: v_mov_b32_e32 v2, s14
; GCN-NEXT: v_mov_b32_e32 v3, s15
-; GCN-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 2, i32 3, i32 1)
@@ -252,7 +252,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd(<8 x bfloat> %arg
; GCN-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; GCN-NEXT: v_mov_b32_e32 v12, 0
+; GCN-NEXT: v_mov_b32_e32 v8, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
@@ -274,40 +274,40 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd(<8 x bfloat> %arg
; GCN-NEXT: v_accvgpr_write_b32 a18, s10
; GCN-NEXT: v_accvgpr_write_b32 a17, s9
; GCN-NEXT: v_accvgpr_write_b32 a16, s8
-; GCN-NEXT: v_mov_b32_e32 v8, s20
-; GCN-NEXT: v_mov_b32_e32 v9, s21
+; GCN-NEXT: v_mov_b32_e32 v10, s20
+; GCN-NEXT: v_mov_b32_e32 v11, s21
; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[16:31]
-; GCN-NEXT: v_mov_b32_e32 v10, s22
-; GCN-NEXT: v_mov_b32_e32 v11, s23
+; GCN-NEXT: v_mov_b32_e32 v12, s22
+; GCN-NEXT: v_mov_b32_e32 v13, s23
; GCN-NEXT: v_mov_b32_e32 v0, s16
; GCN-NEXT: v_mov_b32_e32 v1, s17
; GCN-NEXT: v_mov_b32_e32 v2, s18
; GCN-NEXT: v_mov_b32_e32 v3, s19
-; GCN-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:48 sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:48 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:32 sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v0, s12
; GCN-NEXT: v_mov_b32_e32 v1, s13
; GCN-NEXT: v_mov_b32_e32 v2, s14
; GCN-NEXT: v_mov_b32_e32 v3, s15
-; GCN-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v0, s8
; GCN-NEXT: v_mov_b32_e32 v1, s9
; GCN-NEXT: v_mov_b32_e32 v2, s10
; GCN-NEXT: v_mov_b32_e32 v3, s11
-; GCN-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v12, a[8:11], s[0:1] offset:32 sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v12, a[12:15], s[0:1] offset:48 sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v12, a[0:3], s[0:1] sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v12, a[4:7], s[0:1] offset:16 sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0)
@@ -322,7 +322,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd__flags(<8 x bfloa
; GCN-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; GCN-NEXT: v_mov_b32_e32 v12, 0
+; GCN-NEXT: v_mov_b32_e32 v8, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
@@ -344,40 +344,40 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd__flags(<8 x bfloa
; GCN-NEXT: v_accvgpr_write_b32 a18, s10
; GCN-NEXT: v_accvgpr_write_b32 a17, s9
; GCN-NEXT: v_accvgpr_write_b32 a16, s8
-; GCN-NEXT: v_mov_b32_e32 v8, s20
-; GCN-NEXT: v_mov_b32_e32 v9, s21
+; GCN-NEXT: v_mov_b32_e32 v10, s20
+; GCN-NEXT: v_mov_b32_e32 v11, s21
; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[16:31] cbsz:1 abid:2 blgp:3
-; GCN-NEXT: v_mov_b32_e32 v10, s22
-; GCN-NEXT: v_mov_b32_e32 v11, s23
+; GCN-NEXT: v_mov_b32_e32 v12, s22
+; GCN-NEXT: v_mov_b32_e32 v13, s23
; GCN-NEXT: v_mov_b32_e32 v0, s16
; GCN-NEXT: v_mov_b32_e32 v1, s17
; GCN-NEXT: v_mov_b32_e32 v2, s18
; GCN-NEXT: v_mov_b32_e32 v3, s19
-; GCN-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:48 sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:48 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:32 sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v0, s12
; GCN-NEXT: v_mov_b32_e32 v1, s13
; GCN-NEXT: v_mov_b32_e32 v2, s14
; GCN-NEXT: v_mov_b32_e32 v3, s15
-; GCN-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v0, s8
; GCN-NEXT: v_mov_b32_e32 v1, s9
; GCN-NEXT: v_mov_b32_e32 v2, s10
; GCN-NEXT: v_mov_b32_e32 v3, s11
-; GCN-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v12, a[8:11], s[0:1] offset:32 sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v12, a[12:15], s[0:1] offset:48 sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v12, a[0:3], s[0:1] sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v12, a[4:7], s[0:1] offset:16 sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 1, i32 2, i32 3)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
index 9bdae28f935c1..13e63a5a1e851 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
@@ -394,9 +394,9 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; SDAG-NEXT: v_mov_b64_e32 v[12:13], 48
-; SDAG-NEXT: v_mov_b64_e32 v[14:15], 32
-; SDAG-NEXT: v_mov_b64_e32 v[16:17], 16
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], 48
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], 32
+; SDAG-NEXT: v_mov_b64_e32 v[12:13], 16
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
@@ -418,42 +418,42 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
; SDAG-NEXT: v_accvgpr_write_b32 a13, s21
; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
-; SDAG-NEXT: v_mov_b64_e32 v[18:19], 0
-; SDAG-NEXT: v_mov_b32_e32 v8, s16
+; SDAG-NEXT: v_mov_b64_e32 v[14:15], 0
+; SDAG-NEXT: v_mov_b32_e32 v16, s16
; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15]
; SDAG-NEXT: v_mov_b32_e32 v0, s20
; SDAG-NEXT: v_mov_b32_e32 v1, s21
; SDAG-NEXT: v_mov_b32_e32 v2, s22
; SDAG-NEXT: v_mov_b32_e32 v3, s23
-; SDAG-NEXT: v_mov_b32_e32 v9, s17
-; SDAG-NEXT: v_mov_b32_e32 v10, s18
-; SDAG-NEXT: v_mov_b32_e32 v11, s19
+; SDAG-NEXT: v_mov_b32_e32 v17, s17
+; SDAG-NEXT: v_mov_b32_e32 v18, s18
+; SDAG-NEXT: v_mov_b32_e32 v19, s19
; SDAG-NEXT: s_nop 4
-; SDAG-NEXT: global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[16:17], a[20:23], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[18:19], a[16:19], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[14:15], v[8:11], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mov_b32_e32 v0, s8
; SDAG-NEXT: v_mov_b32_e32 v1, s9
; SDAG-NEXT: v_mov_b32_e32 v2, s10
; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mov_b32_e32 v0, s12
; SDAG-NEXT: v_mov_b32_e32 v1, s13
; SDAG-NEXT: v_mov_b32_e32 v2, s14
; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -776,9 +776,9 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; SDAG-NEXT: v_mov_b64_e32 v[12:13], 48
-; SDAG-NEXT: v_mov_b64_e32 v[14:15], 32
-; SDAG-NEXT: v_mov_b64_e32 v[16:17], 16
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], 48
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], 32
+; SDAG-NEXT: v_mov_b64_e32 v[12:13], 16
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
@@ -800,42 +800,42 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
; SDAG-NEXT: v_accvgpr_write_b32 a13, s21
; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
-; SDAG-NEXT: v_mov_b64_e32 v[18:19], 0
-; SDAG-NEXT: v_mov_b32_e32 v8, s16
+; SDAG-NEXT: v_mov_b64_e32 v[14:15], 0
+; SDAG-NEXT: v_mov_b32_e32 v16, s16
; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1
; SDAG-NEXT: v_mov_b32_e32 v0, s20
; SDAG-NEXT: v_mov_b32_e32 v1, s21
; SDAG-NEXT: v_mov_b32_e32 v2, s22
; SDAG-NEXT: v_mov_b32_e32 v3, s23
-; SDAG-NEXT: v_mov_b32_e32 v9, s17
-; SDAG-NEXT: v_mov_b32_e32 v10, s18
-; SDAG-NEXT: v_mov_b32_e32 v11, s19
+; SDAG-NEXT: v_mov_b32_e32 v17, s17
+; SDAG-NEXT: v_mov_b32_e32 v18, s18
+; SDAG-NEXT: v_mov_b32_e32 v19, s19
; SDAG-NEXT: s_nop 4
-; SDAG-NEXT: global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[16:17], a[20:23], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[18:19], a[16:19], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[14:15], v[8:11], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mov_b32_e32 v0, s8
; SDAG-NEXT: v_mov_b32_e32 v1, s9
; SDAG-NEXT: v_mov_b32_e32 v2, s10
; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mov_b32_e32 v0, s12
; SDAG-NEXT: v_mov_b32_e32 v1, s13
; SDAG-NEXT: v_mov_b32_e32 v2, s14
; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -1505,7 +1505,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0,
; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; SDAG-NEXT: v_mov_b32_e32 v12, 0
+; SDAG-NEXT: v_mov_b32_e32 v8, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
@@ -1527,40 +1527,40 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0,
; SDAG-NEXT: v_accvgpr_write_b32 a18, s10
; SDAG-NEXT: v_accvgpr_write_b32 a17, s9
; SDAG-NEXT: v_accvgpr_write_b32 a16, s8
-; SDAG-NEXT: v_mov_b32_e32 v8, s20
-; SDAG-NEXT: v_mov_b32_e32 v9, s21
+; SDAG-NEXT: v_mov_b32_e32 v10, s20
+; SDAG-NEXT: v_mov_b32_e32 v11, s21
; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[16:31]
-; SDAG-NEXT: v_mov_b32_e32 v10, s22
-; SDAG-NEXT: v_mov_b32_e32 v11, s23
+; SDAG-NEXT: v_mov_b32_e32 v12, s22
+; SDAG-NEXT: v_mov_b32_e32 v13, s23
; SDAG-NEXT: v_mov_b32_e32 v0, s16
; SDAG-NEXT: v_mov_b32_e32 v1, s17
; SDAG-NEXT: v_mov_b32_e32 v2, s18
; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:48 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mov_b32_e32 v0, s12
; SDAG-NEXT: v_mov_b32_e32 v1, s13
; SDAG-NEXT: v_mov_b32_e32 v2, s14
; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mov_b32_e32 v0, s8
; SDAG-NEXT: v_mov_b32_e32 v1, s9
; SDAG-NEXT: v_mov_b32_e32 v2, s10
; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v12, a[8:11], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v12, a[12:15], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v12, a[0:3], s[0:1] sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v12, a[4:7], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -1869,7 +1869,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half>
; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; SDAG-NEXT: v_mov_b32_e32 v12, 0
+; SDAG-NEXT: v_mov_b32_e32 v8, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
@@ -1891,40 +1891,40 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half>
; SDAG-NEXT: v_accvgpr_write_b32 a18, s10
; SDAG-NEXT: v_accvgpr_write_b32 a17, s9
; SDAG-NEXT: v_accvgpr_write_b32 a16, s8
-; SDAG-NEXT: v_mov_b32_e32 v8, s20
-; SDAG-NEXT: v_mov_b32_e32 v9, s21
+; SDAG-NEXT: v_mov_b32_e32 v10, s20
+; SDAG-NEXT: v_mov_b32_e32 v11, s21
; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[16:31] cbsz:1 abid:2 blgp:3
-; SDAG-NEXT: v_mov_b32_e32 v10, s22
-; SDAG-NEXT: v_mov_b32_e32 v11, s23
+; SDAG-NEXT: v_mov_b32_e32 v12, s22
+; SDAG-NEXT: v_mov_b32_e32 v13, s23
; SDAG-NEXT: v_mov_b32_e32 v0, s16
; SDAG-NEXT: v_mov_b32_e32 v1, s17
; SDAG-NEXT: v_mov_b32_e32 v2, s18
; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:48 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mov_b32_e32 v0, s12
; SDAG-NEXT: v_mov_b32_e32 v1, s13
; SDAG-NEXT: v_mov_b32_e32 v2, s14
; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mov_b32_e32 v0, s8
; SDAG-NEXT: v_mov_b32_e32 v1, s9
; SDAG-NEXT: v_mov_b32_e32 v2, s10
; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v12, a[8:11], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v12, a[12:15], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v12, a[0:3], s[0:1] sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v12, a[4:7], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -2781,24 +2781,24 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; SDAG-NEXT: v_mov_b32_e32 v8, 0
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v0, s8
-; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: v_mov_b32_e32 v4, s12
-; SDAG-NEXT: v_mov_b32_e32 v5, s13
-; SDAG-NEXT: v_mov_b32_e32 v6, s14
-; SDAG-NEXT: v_mov_b32_e32 v7, s15
+; SDAG-NEXT: v_mov_b32_e32 v2, s8
+; SDAG-NEXT: v_mov_b32_e32 v3, s9
+; SDAG-NEXT: v_mov_b32_e32 v4, s10
+; SDAG-NEXT: v_mov_b32_e32 v5, s11
+; SDAG-NEXT: v_mov_b32_e32 v6, s12
+; SDAG-NEXT: v_mov_b32_e32 v7, s13
+; SDAG-NEXT: v_mov_b32_e32 v8, s14
+; SDAG-NEXT: v_mov_b32_e32 v9, s15
; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3]
+; SDAG-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[2:5], v[6:9], a[0:3]
; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd:
@@ -2930,24 +2930,24 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; SDAG-NEXT: v_mov_b32_e32 v8, 0
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v0, s8
-; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: v_mov_b32_e32 v4, s12
-; SDAG-NEXT: v_mov_b32_e32 v5, s13
-; SDAG-NEXT: v_mov_b32_e32 v6, s14
-; SDAG-NEXT: v_mov_b32_e32 v7, s15
+; SDAG-NEXT: v_mov_b32_e32 v2, s8
+; SDAG-NEXT: v_mov_b32_e32 v3, s9
+; SDAG-NEXT: v_mov_b32_e32 v4, s10
+; SDAG-NEXT: v_mov_b32_e32 v5, s11
+; SDAG-NEXT: v_mov_b32_e32 v6, s12
+; SDAG-NEXT: v_mov_b32_e32 v7, s13
+; SDAG-NEXT: v_mov_b32_e32 v8, s14
+; SDAG-NEXT: v_mov_b32_e32 v9, s15
; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
+; SDAG-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[2:5], v[6:9], a[0:3] cbsz:3 abid:2 blgp:1
; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags:
@@ -3084,19 +3084,19 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; SDAG-NEXT: v_mov_b64_e32 v[8:9], 48
-; SDAG-NEXT: v_mov_b64_e32 v[10:11], 32
-; SDAG-NEXT: v_mov_b64_e32 v[12:13], 16
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], 32
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v0, s24
-; SDAG-NEXT: v_mov_b32_e32 v1, s25
-; SDAG-NEXT: v_mov_b32_e32 v2, s26
-; SDAG-NEXT: v_mov_b32_e32 v3, s27
+; SDAG-NEXT: v_mov_b32_e32 v8, s24
+; SDAG-NEXT: v_mov_b32_e32 v9, s25
+; SDAG-NEXT: v_mov_b32_e32 v10, s26
+; SDAG-NEXT: v_mov_b32_e32 v11, s27
; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
-; SDAG-NEXT: v_mov_b32_e32 v4, s28
-; SDAG-NEXT: v_mov_b32_e32 v5, s29
-; SDAG-NEXT: v_mov_b32_e32 v6, s30
-; SDAG-NEXT: v_mov_b32_e32 v7, s31
+; SDAG-NEXT: v_mov_b32_e32 v12, s28
+; SDAG-NEXT: v_mov_b32_e32 v13, s29
+; SDAG-NEXT: v_mov_b32_e32 v14, s30
+; SDAG-NEXT: v_mov_b32_e32 v15, s31
; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
@@ -3112,44 +3112,42 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
; SDAG-NEXT: v_accvgpr_write_b32 a13, s21
; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
-; SDAG-NEXT: v_mov_b64_e32 v[14:15], 0
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], 0
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15]
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
+; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[8:11], v[12:15], a[0:15]
+; SDAG-NEXT: v_mov_b32_e32 v8, s16
+; SDAG-NEXT: v_mov_b32_e32 v9, s17
+; SDAG-NEXT: v_mov_b32_e32 v10, s18
+; SDAG-NEXT: v_mov_b32_e32 v11, s19
; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[0:1], a[28:31], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[2:3], a[24:27], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[4:5], a[20:23], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[6:7], a[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[2:3], v[8:11], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s20
-; SDAG-NEXT: v_mov_b32_e32 v1, s21
-; SDAG-NEXT: v_mov_b32_e32 v2, s22
-; SDAG-NEXT: v_mov_b32_e32 v3, s23
-; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v2, s10
+; SDAG-NEXT: v_mov_b32_e32 v3, s11
+; SDAG-NEXT: v_mov_b32_e32 v8, s20
+; SDAG-NEXT: v_mov_b32_e32 v9, s21
+; SDAG-NEXT: v_mov_b32_e32 v10, s22
+; SDAG-NEXT: v_mov_b32_e32 v11, s23
+; SDAG-NEXT: global_store_dwordx4 v[0:1], v[8:11], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mov_b32_e32 v0, s8
; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mov_b32_e32 v0, s12
; SDAG-NEXT: v_mov_b32_e32 v1, s13
; SDAG-NEXT: v_mov_b32_e32 v2, s14
; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -3496,19 +3494,19 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; SDAG-NEXT: v_mov_b64_e32 v[8:9], 48
-; SDAG-NEXT: v_mov_b64_e32 v[10:11], 32
-; SDAG-NEXT: v_mov_b64_e32 v[12:13], 16
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], 32
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v0, s24
-; SDAG-NEXT: v_mov_b32_e32 v1, s25
-; SDAG-NEXT: v_mov_b32_e32 v2, s26
-; SDAG-NEXT: v_mov_b32_e32 v3, s27
+; SDAG-NEXT: v_mov_b32_e32 v8, s24
+; SDAG-NEXT: v_mov_b32_e32 v9, s25
+; SDAG-NEXT: v_mov_b32_e32 v10, s26
+; SDAG-NEXT: v_mov_b32_e32 v11, s27
; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
-; SDAG-NEXT: v_mov_b32_e32 v4, s28
-; SDAG-NEXT: v_mov_b32_e32 v5, s29
-; SDAG-NEXT: v_mov_b32_e32 v6, s30
-; SDAG-NEXT: v_mov_b32_e32 v7, s31
+; SDAG-NEXT: v_mov_b32_e32 v12, s28
+; SDAG-NEXT: v_mov_b32_e32 v13, s29
+; SDAG-NEXT: v_mov_b32_e32 v14, s30
+; SDAG-NEXT: v_mov_b32_e32 v15, s31
; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
@@ -3524,44 +3522,42 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
; SDAG-NEXT: v_accvgpr_write_b32 a13, s21
; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
-; SDAG-NEXT: v_mov_b64_e32 v[14:15], 0
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], 0
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
+; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[8:11], v[12:15], a[0:15] cbsz:2 abid:3 blgp:1
+; SDAG-NEXT: v_mov_b32_e32 v8, s16
+; SDAG-NEXT: v_mov_b32_e32 v9, s17
+; SDAG-NEXT: v_mov_b32_e32 v10, s18
+; SDAG-NEXT: v_mov_b32_e32 v11, s19
; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[0:1], a[28:31], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[2:3], a[24:27], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[4:5], a[20:23], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[6:7], a[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[2:3], v[8:11], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s20
-; SDAG-NEXT: v_mov_b32_e32 v1, s21
-; SDAG-NEXT: v_mov_b32_e32 v2, s22
-; SDAG-NEXT: v_mov_b32_e32 v3, s23
-; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v2, s10
+; SDAG-NEXT: v_mov_b32_e32 v3, s11
+; SDAG-NEXT: v_mov_b32_e32 v8, s20
+; SDAG-NEXT: v_mov_b32_e32 v9, s21
+; SDAG-NEXT: v_mov_b32_e32 v10, s22
+; SDAG-NEXT: v_mov_b32_e32 v11, s23
+; SDAG-NEXT: global_store_dwordx4 v[0:1], v[8:11], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mov_b32_e32 v0, s8
; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mov_b32_e32 v0, s12
; SDAG-NEXT: v_mov_b32_e32 v1, s13
; SDAG-NEXT: v_mov_b32_e32 v2, s14
; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -4254,17 +4250,17 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; SDAG-NEXT: v_mov_b32_e32 v8, 0
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v0, s20
-; SDAG-NEXT: v_mov_b32_e32 v1, s21
-; SDAG-NEXT: v_mov_b32_e32 v2, s22
-; SDAG-NEXT: v_mov_b32_e32 v3, s23
+; SDAG-NEXT: v_mov_b32_e32 v2, s20
+; SDAG-NEXT: v_mov_b32_e32 v3, s21
+; SDAG-NEXT: v_mov_b32_e32 v4, s22
+; SDAG-NEXT: v_mov_b32_e32 v5, s23
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; SDAG-NEXT: v_mov_b32_e32 v4, s24
-; SDAG-NEXT: v_mov_b32_e32 v5, s25
-; SDAG-NEXT: v_mov_b32_e32 v6, s26
-; SDAG-NEXT: v_mov_b32_e32 v7, s27
+; SDAG-NEXT: v_mov_b32_e32 v6, s24
+; SDAG-NEXT: v_mov_b32_e32 v7, s25
+; SDAG-NEXT: v_mov_b32_e32 v8, s26
+; SDAG-NEXT: v_mov_b32_e32 v9, s27
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_accvgpr_write_b32 a31, s23
; SDAG-NEXT: v_accvgpr_write_b32 a30, s22
@@ -4283,41 +4279,41 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4
; SDAG-NEXT: v_accvgpr_write_b32 a17, s9
; SDAG-NEXT: v_accvgpr_write_b32 a16, s8
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[16:31]
-; SDAG-NEXT: v_mov_b32_e32 v0, s20
-; SDAG-NEXT: v_mov_b32_e32 v1, s21
-; SDAG-NEXT: v_mov_b32_e32 v2, s22
-; SDAG-NEXT: v_mov_b32_e32 v3, s23
-; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[2:5], v[6:9], a[16:31]
+; SDAG-NEXT: v_mov_b32_e32 v2, s20
+; SDAG-NEXT: v_mov_b32_e32 v3, s21
+; SDAG-NEXT: v_mov_b32_e32 v4, s22
+; SDAG-NEXT: v_mov_b32_e32 v5, s23
+; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:48 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v2, s16
+; SDAG-NEXT: v_mov_b32_e32 v3, s17
+; SDAG-NEXT: v_mov_b32_e32 v4, s18
+; SDAG-NEXT: v_mov_b32_e32 v5, s19
+; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:32 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s12
-; SDAG-NEXT: v_mov_b32_e32 v1, s13
-; SDAG-NEXT: v_mov_b32_e32 v2, s14
-; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v2, s12
+; SDAG-NEXT: v_mov_b32_e32 v3, s13
+; SDAG-NEXT: v_mov_b32_e32 v4, s14
+; SDAG-NEXT: v_mov_b32_e32 v5, s15
+; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:16 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s8
-; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v2, s8
+; SDAG-NEXT: v_mov_b32_e32 v3, s9
+; SDAG-NEXT: v_mov_b32_e32 v4, s10
+; SDAG-NEXT: v_mov_b32_e32 v5, s11
+; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -4653,17 +4649,17 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; SDAG-NEXT: v_mov_b32_e32 v8, 0
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v0, s20
-; SDAG-NEXT: v_mov_b32_e32 v1, s21
-; SDAG-NEXT: v_mov_b32_e32 v2, s22
-; SDAG-NEXT: v_mov_b32_e32 v3, s23
+; SDAG-NEXT: v_mov_b32_e32 v2, s20
+; SDAG-NEXT: v_mov_b32_e32 v3, s21
+; SDAG-NEXT: v_mov_b32_e32 v4, s22
+; SDAG-NEXT: v_mov_b32_e32 v5, s23
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; SDAG-NEXT: v_mov_b32_e32 v4, s24
-; SDAG-NEXT: v_mov_b32_e32 v5, s25
-; SDAG-NEXT: v_mov_b32_e32 v6, s26
-; SDAG-NEXT: v_mov_b32_e32 v7, s27
+; SDAG-NEXT: v_mov_b32_e32 v6, s24
+; SDAG-NEXT: v_mov_b32_e32 v7, s25
+; SDAG-NEXT: v_mov_b32_e32 v8, s26
+; SDAG-NEXT: v_mov_b32_e32 v9, s27
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_accvgpr_write_b32 a31, s23
; SDAG-NEXT: v_accvgpr_write_b32 a30, s22
@@ -4682,41 +4678,41 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a
; SDAG-NEXT: v_accvgpr_write_b32 a17, s9
; SDAG-NEXT: v_accvgpr_write_b32 a16, s8
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[16:31] cbsz:1 abid:2 blgp:3
-; SDAG-NEXT: v_mov_b32_e32 v0, s20
-; SDAG-NEXT: v_mov_b32_e32 v1, s21
-; SDAG-NEXT: v_mov_b32_e32 v2, s22
-; SDAG-NEXT: v_mov_b32_e32 v3, s23
-; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[2:5], v[6:9], a[16:31] cbsz:1 abid:2 blgp:3
+; SDAG-NEXT: v_mov_b32_e32 v2, s20
+; SDAG-NEXT: v_mov_b32_e32 v3, s21
+; SDAG-NEXT: v_mov_b32_e32 v4, s22
+; SDAG-NEXT: v_mov_b32_e32 v5, s23
+; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:48 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v2, s16
+; SDAG-NEXT: v_mov_b32_e32 v3, s17
+; SDAG-NEXT: v_mov_b32_e32 v4, s18
+; SDAG-NEXT: v_mov_b32_e32 v5, s19
+; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:32 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s12
-; SDAG-NEXT: v_mov_b32_e32 v1, s13
-; SDAG-NEXT: v_mov_b32_e32 v2, s14
-; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v2, s12
+; SDAG-NEXT: v_mov_b32_e32 v3, s13
+; SDAG-NEXT: v_mov_b32_e32 v4, s14
+; SDAG-NEXT: v_mov_b32_e32 v5, s15
+; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:16 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s8
-; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v2, s8
+; SDAG-NEXT: v_mov_b32_e32 v3, s9
+; SDAG-NEXT: v_mov_b32_e32 v4, s10
+; SDAG-NEXT: v_mov_b32_e32 v5, s11
+; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
index ff305da49852c..a959ead39c4ac 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
@@ -1114,19 +1114,19 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a27, v2
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a28, v0
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a29, v1
-; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s14
-; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s15
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s14
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s15
; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
-; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s0
-; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s1
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s0
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s1
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v5
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v6
-; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a30, v2
-; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a31, v3
-; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s2
-; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s3
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a30, v0
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a31, v1
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s2
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s3
; NOLIT-SRCC-NEXT: s_nop 1
-; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x4f16 a[0:31], v[0:1], v[2:3], a[0:31] cbsz:1 abid:2 blgp:3
+; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x4f16 a[0:31], v[2:3], v[0:1], a[0:31] cbsz:1 abid:2 blgp:3
; NOLIT-SRCC-NEXT: s_nop 7
; NOLIT-SRCC-NEXT: s_nop 7
; NOLIT-SRCC-NEXT: s_nop 1
@@ -1254,19 +1254,19 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a27, v2
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a28, v0
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a29, v1
-; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s14
-; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s15
+; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s14
+; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s15
; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
-; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s0
-; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s1
+; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s0
+; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s1
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v5
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v6
-; LIT-SRCC-NEXT: v_accvgpr_write_b32 a30, v2
-; LIT-SRCC-NEXT: v_accvgpr_write_b32 a31, v3
-; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s2
-; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s3
+; LIT-SRCC-NEXT: v_accvgpr_write_b32 a30, v0
+; LIT-SRCC-NEXT: v_accvgpr_write_b32 a31, v1
+; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s2
+; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s3
; LIT-SRCC-NEXT: s_nop 1
-; LIT-SRCC-NEXT: v_mfma_f32_32x32x4f16 a[0:31], v[0:1], v[2:3], a[0:31] cbsz:1 abid:2 blgp:3
+; LIT-SRCC-NEXT: v_mfma_f32_32x32x4f16 a[0:31], v[2:3], v[0:1], a[0:31] cbsz:1 abid:2 blgp:3
; LIT-SRCC-NEXT: s_nop 7
; LIT-SRCC-NEXT: s_nop 7
; LIT-SRCC-NEXT: s_nop 1
@@ -1330,7 +1330,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a
; GFX90A-LABEL: test_mfma_f32_32x32x4f16:
; GFX90A: ; %bb.0: ; %bb
; GFX90A-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x24
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[36:37], 0x40
; GFX90A-NEXT: s_load_dwordx16 s[16:31], s[36:37], 0x0
@@ -1345,8 +1345,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a
; GFX90A-NEXT: v_accvgpr_write_b32 a2, s18
; GFX90A-NEXT: v_accvgpr_write_b32 a3, s19
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, s0
-; GFX90A-NEXT: v_mov_b32_e32 v1, s1
+; GFX90A-NEXT: v_mov_b32_e32 v2, s0
+; GFX90A-NEXT: v_mov_b32_e32 v3, s1
; GFX90A-NEXT: v_accvgpr_write_b32 a4, s20
; GFX90A-NEXT: v_accvgpr_write_b32 a5, s21
; GFX90A-NEXT: v_accvgpr_write_b32 a6, s22
@@ -1371,27 +1371,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a
; GFX90A-NEXT: v_accvgpr_write_b32 a29, s13
; GFX90A-NEXT: v_accvgpr_write_b32 a30, s14
; GFX90A-NEXT: v_accvgpr_write_b32 a31, s15
-; GFX90A-NEXT: v_mov_b32_e32 v2, s2
-; GFX90A-NEXT: v_mov_b32_e32 v3, s3
+; GFX90A-NEXT: v_mov_b32_e32 v4, s2
+; GFX90A-NEXT: v_mov_b32_e32 v5, s3
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f32_32x32x4f16 a[0:31], v[0:1], v[2:3], a[0:31] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mfma_f32_32x32x4f16 a[0:31], v[2:3], v[4:5], a[0:31] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 2
-; GFX90A-NEXT: global_store_dwordx4 v4, a[24:27], s[36:37] offset:96
-; GFX90A-NEXT: global_store_dwordx4 v4, a[28:31], s[36:37] offset:112
-; GFX90A-NEXT: global_store_dwordx4 v4, a[16:19], s[36:37] offset:64
-; GFX90A-NEXT: global_store_dwordx4 v4, a[20:23], s[36:37] offset:80
-; GFX90A-NEXT: global_store_dwordx4 v4, a[8:11], s[36:37] offset:32
-; GFX90A-NEXT: global_store_dwordx4 v4, a[12:15], s[36:37] offset:48
-; GFX90A-NEXT: global_store_dwordx4 v4, a[0:3], s[36:37]
-; GFX90A-NEXT: global_store_dwordx4 v4, a[4:7], s[36:37] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[36:37] offset:96
+; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[36:37] offset:112
+; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[36:37] offset:64
+; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[36:37] offset:80
+; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[36:37] offset:32
+; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[36:37] offset:48
+; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[36:37]
+; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[36:37] offset:16
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_f32_32x32x4f16:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x24
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_load_dwordx16 s[0:15], s[36:37], 0x40
; GFX942-NEXT: s_load_dwordx16 s[16:31], s[36:37], 0x0
@@ -1406,8 +1406,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a
; GFX942-NEXT: v_accvgpr_write_b32 a2, s18
; GFX942-NEXT: v_accvgpr_write_b32 a3, s19
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v0, s0
-; GFX942-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NEXT: v_mov_b32_e32 v2, s0
+; GFX942-NEXT: v_mov_b32_e32 v3, s1
; GFX942-NEXT: v_accvgpr_write_b32 a4, s20
; GFX942-NEXT: v_accvgpr_write_b32 a5, s21
; GFX942-NEXT: v_accvgpr_write_b32 a6, s22
@@ -1432,21 +1432,21 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a
; GFX942-NEXT: v_accvgpr_write_b32 a29, s13
; GFX942-NEXT: v_accvgpr_write_b32 a30, s14
; GFX942-NEXT: v_accvgpr_write_b32 a31, s15
-; GFX942-NEXT: v_mov_b32_e32 v2, s2
-; GFX942-NEXT: v_mov_b32_e32 v3, s3
+; GFX942-NEXT: v_mov_b32_e32 v4, s2
+; GFX942-NEXT: v_mov_b32_e32 v5, s3
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_32x32x4_2b_f16 a[0:31], v[0:1], v[2:3], a[0:31] cbsz:1 abid:2 blgp:3
+; GFX942-NEXT: v_mfma_f32_32x32x4_2b_f16 a[0:31], v[2:3], v[4:5], a[0:31] cbsz:1 abid:2 blgp:3
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 2
-; GFX942-NEXT: global_store_dwordx4 v4, a[24:27], s[36:37] offset:96
-; GFX942-NEXT: global_store_dwordx4 v4, a[28:31], s[36:37] offset:112
-; GFX942-NEXT: global_store_dwordx4 v4, a[16:19], s[36:37] offset:64
-; GFX942-NEXT: global_store_dwordx4 v4, a[20:23], s[36:37] offset:80
-; GFX942-NEXT: global_store_dwordx4 v4, a[8:11], s[36:37] offset:32
-; GFX942-NEXT: global_store_dwordx4 v4, a[12:15], s[36:37] offset:48
-; GFX942-NEXT: global_store_dwordx4 v4, a[0:3], s[36:37]
-; GFX942-NEXT: global_store_dwordx4 v4, a[4:7], s[36:37] offset:16
+; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[36:37] offset:96
+; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[36:37] offset:112
+; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[36:37] offset:64
+; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[36:37] offset:80
+; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[36:37] offset:32
+; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[36:37] offset:48
+; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[36:37]
+; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[36:37] offset:16
; GFX942-NEXT: s_endpgm
bb:
%in.1 = load <32 x float>, ptr addrspace(1) %arg
@@ -1752,45 +1752,45 @@ define amdgpu_kernel void @test_mfma_f32_4x4x4f16(ptr addrspace(1) %arg, ptr add
; GFX90A-LABEL: test_mfma_f32_4x4x4f16:
; GFX90A: ; %bb.0: ; %bb
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, s4
-; GFX90A-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NEXT: v_mov_b32_e32 v3, s5
; GFX90A-NEXT: v_accvgpr_write_b32 a0, s8
-; GFX90A-NEXT: v_mov_b32_e32 v2, s6
-; GFX90A-NEXT: v_mov_b32_e32 v3, s7
+; GFX90A-NEXT: v_mov_b32_e32 v4, s6
+; GFX90A-NEXT: v_mov_b32_e32 v5, s7
; GFX90A-NEXT: v_accvgpr_write_b32 a1, s9
; GFX90A-NEXT: v_accvgpr_write_b32 a2, s10
; GFX90A-NEXT: v_accvgpr_write_b32 a3, s11
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f32_4x4x4f16 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mfma_f32_4x4x4f16 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: s_nop 4
-; GFX90A-NEXT: global_store_dwordx4 v4, a[0:3], s[0:1]
+; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_f32_4x4x4f16:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX942-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v0, s4
-; GFX942-NEXT: v_mov_b32_e32 v1, s5
+; GFX942-NEXT: v_mov_b32_e32 v2, s4
+; GFX942-NEXT: v_mov_b32_e32 v3, s5
; GFX942-NEXT: v_accvgpr_write_b32 a0, s8
-; GFX942-NEXT: v_mov_b32_e32 v2, s6
-; GFX942-NEXT: v_mov_b32_e32 v3, s7
+; GFX942-NEXT: v_mov_b32_e32 v4, s6
+; GFX942-NEXT: v_mov_b32_e32 v5, s7
; GFX942-NEXT: v_accvgpr_write_b32 a1, s9
; GFX942-NEXT: v_accvgpr_write_b32 a2, s10
; GFX942-NEXT: v_accvgpr_write_b32 a3, s11
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_4x4x4_16b_f16 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-NEXT: v_mfma_f32_4x4x4_16b_f16 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
; GFX942-NEXT: s_nop 4
-; GFX942-NEXT: global_store_dwordx4 v4, a[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
@@ -2099,46 +2099,46 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16f16(ptr addrspace(1) %arg, ptr
; GFX90A-LABEL: test_mfma_f32_16x16x16f16:
; GFX90A: ; %bb.0: ; %bb
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, s4
-; GFX90A-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NEXT: v_mov_b32_e32 v3, s5
; GFX90A-NEXT: v_accvgpr_write_b32 a0, s8
-; GFX90A-NEXT: v_mov_b32_e32 v2, s6
-; GFX90A-NEXT: v_mov_b32_e32 v3, s7
+; GFX90A-NEXT: v_mov_b32_e32 v4, s6
+; GFX90A-NEXT: v_mov_b32_e32 v5, s7
; GFX90A-NEXT: v_accvgpr_write_b32 a1, s9
; GFX90A-NEXT: v_accvgpr_write_b32 a2, s10
; GFX90A-NEXT: v_accvgpr_write_b32 a3, s11
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f32_16x16x16f16 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mfma_f32_16x16x16f16 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 2
-; GFX90A-NEXT: global_store_dwordx4 v4, a[0:3], s[0:1]
+; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_f32_16x16x16f16:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX942-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v0, s4
-; GFX942-NEXT: v_mov_b32_e32 v1, s5
+; GFX942-NEXT: v_mov_b32_e32 v2, s4
+; GFX942-NEXT: v_mov_b32_e32 v3, s5
; GFX942-NEXT: v_accvgpr_write_b32 a0, s8
-; GFX942-NEXT: v_mov_b32_e32 v2, s6
-; GFX942-NEXT: v_mov_b32_e32 v3, s7
+; GFX942-NEXT: v_mov_b32_e32 v4, s6
+; GFX942-NEXT: v_mov_b32_e32 v5, s7
; GFX942-NEXT: v_accvgpr_write_b32 a1, s9
; GFX942-NEXT: v_accvgpr_write_b32 a2, s10
; GFX942-NEXT: v_accvgpr_write_b32 a3, s11
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_16x16x16_f16 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-NEXT: v_mfma_f32_16x16x16_f16 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
; GFX942-NEXT: s_nop 6
-; GFX942-NEXT: global_store_dwordx4 v4, a[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
index 04ee0bbd17673..37809da10241b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
@@ -1485,30 +1485,30 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inr
; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v12, s0
-; SDAG-NEXT: v_mov_b32_e32 v13, s1
-; SDAG-NEXT: v_mov_b32_e32 v14, s2
-; SDAG-NEXT: v_mov_b32_e32 v15, s3
-; SDAG-NEXT: v_mov_b32_e32 v16, s16
-; SDAG-NEXT: v_mov_b32_e32 v17, s17
-; SDAG-NEXT: v_mov_b32_e32 v18, s18
-; SDAG-NEXT: v_mov_b32_e32 v19, s19
-; SDAG-NEXT: v_mov_b32_e32 v20, s28
-; SDAG-NEXT: v_mov_b32_e32 v21, s29
-; SDAG-NEXT: v_mov_b32_e32 v4, s20
-; SDAG-NEXT: v_mov_b32_e32 v5, s21
-; SDAG-NEXT: v_mov_b32_e32 v6, s22
-; SDAG-NEXT: v_mov_b32_e32 v7, s23
-; SDAG-NEXT: v_mov_b32_e32 v8, s24
-; SDAG-NEXT: v_mov_b32_e32 v9, s25
-; SDAG-NEXT: v_mov_b32_e32 v10, s26
-; SDAG-NEXT: v_mov_b32_e32 v11, s27
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v21
+; SDAG-NEXT: v_mov_b32_e32 v14, s0
+; SDAG-NEXT: v_mov_b32_e32 v15, s1
+; SDAG-NEXT: v_mov_b32_e32 v16, s2
+; SDAG-NEXT: v_mov_b32_e32 v17, s3
+; SDAG-NEXT: v_mov_b32_e32 v18, s16
+; SDAG-NEXT: v_mov_b32_e32 v19, s17
+; SDAG-NEXT: v_mov_b32_e32 v20, s18
+; SDAG-NEXT: v_mov_b32_e32 v21, s19
+; SDAG-NEXT: v_mov_b32_e32 v4, s28
+; SDAG-NEXT: v_mov_b32_e32 v5, s29
+; SDAG-NEXT: v_mov_b32_e32 v6, s20
+; SDAG-NEXT: v_mov_b32_e32 v7, s21
+; SDAG-NEXT: v_mov_b32_e32 v8, s22
+; SDAG-NEXT: v_mov_b32_e32 v9, s23
+; SDAG-NEXT: v_mov_b32_e32 v10, s24
+; SDAG-NEXT: v_mov_b32_e32 v11, s25
+; SDAG-NEXT: v_mov_b32_e32 v12, s26
+; SDAG-NEXT: v_mov_b32_e32 v13, s27
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v4
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v5
; SDAG-NEXT: v_accvgpr_write_b32 a2, v0
; SDAG-NEXT: v_accvgpr_write_b32 a3, v1
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[12:19], v[4:11], a[0:3], v2, v3 op_sel_hi:[0,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[6:13], a[0:3], v2, v3 op_sel_hi:[0,0,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
@@ -1895,36 +1895,36 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32
; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v0, s8
-; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: v_mov_b32_e32 v4, s12
-; SDAG-NEXT: v_mov_b32_e32 v5, s13
-; SDAG-NEXT: v_mov_b32_e32 v6, s14
-; SDAG-NEXT: v_mov_b32_e32 v7, s15
+; SDAG-NEXT: v_mov_b32_e32 v2, s8
+; SDAG-NEXT: v_mov_b32_e32 v3, s9
+; SDAG-NEXT: v_mov_b32_e32 v4, s10
+; SDAG-NEXT: v_mov_b32_e32 v5, s11
+; SDAG-NEXT: v_mov_b32_e32 v6, s12
+; SDAG-NEXT: v_mov_b32_e32 v7, s13
+; SDAG-NEXT: v_mov_b32_e32 v8, s14
+; SDAG-NEXT: v_mov_b32_e32 v9, s15
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40
-; SDAG-NEXT: v_mov_b32_e32 v8, s16
-; SDAG-NEXT: v_mov_b32_e32 v9, s17
-; SDAG-NEXT: v_mov_b32_e32 v10, s18
-; SDAG-NEXT: v_mov_b32_e32 v11, s19
-; SDAG-NEXT: v_mov_b32_e32 v12, s20
-; SDAG-NEXT: v_mov_b32_e32 v13, s21
-; SDAG-NEXT: v_mov_b32_e32 v14, s22
-; SDAG-NEXT: v_mov_b32_e32 v15, s23
+; SDAG-NEXT: v_mov_b32_e32 v10, s16
+; SDAG-NEXT: v_mov_b32_e32 v11, s17
+; SDAG-NEXT: v_mov_b32_e32 v12, s18
+; SDAG-NEXT: v_mov_b32_e32 v13, s19
+; SDAG-NEXT: v_mov_b32_e32 v14, s20
+; SDAG-NEXT: v_mov_b32_e32 v15, s21
+; SDAG-NEXT: v_mov_b32_e32 v16, s22
+; SDAG-NEXT: v_mov_b32_e32 v17, s23
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
-; SDAG-NEXT: v_mov_b32_e32 v17, s13
+; SDAG-NEXT: v_mov_b32_e32 v1, s13
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s12, v17 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[2:9], v[10:17], a[0:3], s12, v1 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[14:15]
+; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[14:15]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd:
@@ -1964,33 +1964,33 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
; SDAG-NEXT: s_movk_i32 s6, 0x41
; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v0, s8
-; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: v_mov_b32_e32 v4, s12
-; SDAG-NEXT: v_mov_b32_e32 v5, s13
-; SDAG-NEXT: v_mov_b32_e32 v6, s14
-; SDAG-NEXT: v_mov_b32_e32 v7, s15
+; SDAG-NEXT: v_mov_b32_e32 v2, s8
+; SDAG-NEXT: v_mov_b32_e32 v3, s9
+; SDAG-NEXT: v_mov_b32_e32 v4, s10
+; SDAG-NEXT: v_mov_b32_e32 v5, s11
+; SDAG-NEXT: v_mov_b32_e32 v6, s12
+; SDAG-NEXT: v_mov_b32_e32 v7, s13
+; SDAG-NEXT: v_mov_b32_e32 v8, s14
+; SDAG-NEXT: v_mov_b32_e32 v9, s15
; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; SDAG-NEXT: v_mov_b32_e32 v8, s16
-; SDAG-NEXT: v_mov_b32_e32 v9, s17
-; SDAG-NEXT: v_mov_b32_e32 v10, s18
-; SDAG-NEXT: v_mov_b32_e32 v11, s19
-; SDAG-NEXT: v_mov_b32_e32 v12, s20
-; SDAG-NEXT: v_mov_b32_e32 v13, s21
-; SDAG-NEXT: v_mov_b32_e32 v14, s22
-; SDAG-NEXT: v_mov_b32_e32 v15, s23
+; SDAG-NEXT: v_mov_b32_e32 v10, s16
+; SDAG-NEXT: v_mov_b32_e32 v11, s17
+; SDAG-NEXT: v_mov_b32_e32 v12, s18
+; SDAG-NEXT: v_mov_b32_e32 v13, s19
+; SDAG-NEXT: v_mov_b32_e32 v14, s20
+; SDAG-NEXT: v_mov_b32_e32 v15, s21
+; SDAG-NEXT: v_mov_b32_e32 v16, s22
+; SDAG-NEXT: v_mov_b32_e32 v17, s23
; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s6, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[2:9], v[10:17], a[0:3], s6, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[4:5]
+; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm:
@@ -2031,33 +2031,33 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
; SDAG-NEXT: s_movk_i32 s6, 0x41
; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v0, s8
-; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: v_mov_b32_e32 v4, s12
-; SDAG-NEXT: v_mov_b32_e32 v5, s13
-; SDAG-NEXT: v_mov_b32_e32 v6, s14
-; SDAG-NEXT: v_mov_b32_e32 v7, s15
+; SDAG-NEXT: v_mov_b32_e32 v2, s8
+; SDAG-NEXT: v_mov_b32_e32 v3, s9
+; SDAG-NEXT: v_mov_b32_e32 v4, s10
+; SDAG-NEXT: v_mov_b32_e32 v5, s11
+; SDAG-NEXT: v_mov_b32_e32 v6, s12
+; SDAG-NEXT: v_mov_b32_e32 v7, s13
+; SDAG-NEXT: v_mov_b32_e32 v8, s14
+; SDAG-NEXT: v_mov_b32_e32 v9, s15
; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; SDAG-NEXT: v_mov_b32_e32 v8, s16
-; SDAG-NEXT: v_mov_b32_e32 v9, s17
-; SDAG-NEXT: v_mov_b32_e32 v10, s18
-; SDAG-NEXT: v_mov_b32_e32 v11, s19
-; SDAG-NEXT: v_mov_b32_e32 v12, s20
-; SDAG-NEXT: v_mov_b32_e32 v13, s21
-; SDAG-NEXT: v_mov_b32_e32 v14, s22
-; SDAG-NEXT: v_mov_b32_e32 v15, s23
+; SDAG-NEXT: v_mov_b32_e32 v10, s16
+; SDAG-NEXT: v_mov_b32_e32 v11, s17
+; SDAG-NEXT: v_mov_b32_e32 v12, s18
+; SDAG-NEXT: v_mov_b32_e32 v13, s19
+; SDAG-NEXT: v_mov_b32_e32 v14, s20
+; SDAG-NEXT: v_mov_b32_e32 v15, s21
+; SDAG-NEXT: v_mov_b32_e32 v16, s22
+; SDAG-NEXT: v_mov_b32_e32 v17, s23
; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s6, 1.0 op_sel:[1,1,0] op_sel_hi:[1,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[2:9], v[10:17], a[0:3], s6, 1.0 op_sel:[1,1,0] op_sel_hi:[1,0,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[4:5]
+; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__FP_literal:
@@ -2096,34 +2096,34 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v0, s8
-; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: v_mov_b32_e32 v4, s12
-; SDAG-NEXT: v_mov_b32_e32 v5, s13
-; SDAG-NEXT: v_mov_b32_e32 v6, s14
-; SDAG-NEXT: v_mov_b32_e32 v7, s15
+; SDAG-NEXT: v_mov_b32_e32 v2, s8
+; SDAG-NEXT: v_mov_b32_e32 v3, s9
+; SDAG-NEXT: v_mov_b32_e32 v4, s10
+; SDAG-NEXT: v_mov_b32_e32 v5, s11
+; SDAG-NEXT: v_mov_b32_e32 v6, s12
+; SDAG-NEXT: v_mov_b32_e32 v7, s13
+; SDAG-NEXT: v_mov_b32_e32 v8, s14
+; SDAG-NEXT: v_mov_b32_e32 v9, s15
; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; SDAG-NEXT: v_mov_b32_e32 v8, s16
-; SDAG-NEXT: v_mov_b32_e32 v9, s17
-; SDAG-NEXT: v_mov_b32_e32 v10, s18
-; SDAG-NEXT: v_mov_b32_e32 v11, s19
-; SDAG-NEXT: v_mov_b32_e32 v12, s20
-; SDAG-NEXT: v_mov_b32_e32 v13, s21
-; SDAG-NEXT: v_mov_b32_e32 v14, s22
-; SDAG-NEXT: v_mov_b32_e32 v15, s23
+; SDAG-NEXT: v_mov_b32_e32 v10, s16
+; SDAG-NEXT: v_mov_b32_e32 v11, s17
+; SDAG-NEXT: v_mov_b32_e32 v12, s18
+; SDAG-NEXT: v_mov_b32_e32 v13, s19
+; SDAG-NEXT: v_mov_b32_e32 v14, s20
+; SDAG-NEXT: v_mov_b32_e32 v15, s21
+; SDAG-NEXT: v_mov_b32_e32 v16, s22
+; SDAG-NEXT: v_mov_b32_e32 v17, s23
; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1.0, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[2:9], v[10:17], a[0:3], 1.0, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[4:5]
+; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__inline_imm:
@@ -2162,34 +2162,34 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v0, s8
-; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: v_mov_b32_e32 v4, s12
-; SDAG-NEXT: v_mov_b32_e32 v5, s13
-; SDAG-NEXT: v_mov_b32_e32 v6, s14
-; SDAG-NEXT: v_mov_b32_e32 v7, s15
+; SDAG-NEXT: v_mov_b32_e32 v2, s8
+; SDAG-NEXT: v_mov_b32_e32 v3, s9
+; SDAG-NEXT: v_mov_b32_e32 v4, s10
+; SDAG-NEXT: v_mov_b32_e32 v5, s11
+; SDAG-NEXT: v_mov_b32_e32 v6, s12
+; SDAG-NEXT: v_mov_b32_e32 v7, s13
+; SDAG-NEXT: v_mov_b32_e32 v8, s14
+; SDAG-NEXT: v_mov_b32_e32 v9, s15
; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; SDAG-NEXT: v_mov_b32_e32 v8, s16
-; SDAG-NEXT: v_mov_b32_e32 v9, s17
-; SDAG-NEXT: v_mov_b32_e32 v10, s18
-; SDAG-NEXT: v_mov_b32_e32 v11, s19
-; SDAG-NEXT: v_mov_b32_e32 v12, s20
-; SDAG-NEXT: v_mov_b32_e32 v13, s21
-; SDAG-NEXT: v_mov_b32_e32 v14, s22
-; SDAG-NEXT: v_mov_b32_e32 v15, s23
+; SDAG-NEXT: v_mov_b32_e32 v10, s16
+; SDAG-NEXT: v_mov_b32_e32 v11, s17
+; SDAG-NEXT: v_mov_b32_e32 v12, s18
+; SDAG-NEXT: v_mov_b32_e32 v13, s19
+; SDAG-NEXT: v_mov_b32_e32 v14, s20
+; SDAG-NEXT: v_mov_b32_e32 v15, s21
+; SDAG-NEXT: v_mov_b32_e32 v16, s22
+; SDAG-NEXT: v_mov_b32_e32 v17, s23
; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1.0, 0.15915494 op_sel:[1,1,0] op_sel_hi:[1,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[2:9], v[10:17], a[0:3], 1.0, 0.15915494 op_sel:[1,1,0] op_sel_hi:[1,0,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[4:5]
+; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__FP_literal:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
index 91197f915b659..bc50058778dbf 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
@@ -3515,26 +3515,26 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inr
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v24, s0
-; SDAG-NEXT: v_mov_b32_e32 v25, s1
-; SDAG-NEXT: v_mov_b32_e32 v26, s2
-; SDAG-NEXT: v_mov_b32_e32 v27, s3
-; SDAG-NEXT: v_mov_b32_e32 v28, s16
-; SDAG-NEXT: v_mov_b32_e32 v29, s17
-; SDAG-NEXT: v_mov_b32_e32 v30, s18
-; SDAG-NEXT: v_mov_b32_e32 v31, s19
-; SDAG-NEXT: v_mov_b32_e32 v32, s28
-; SDAG-NEXT: v_mov_b32_e32 v33, s29
-; SDAG-NEXT: v_mov_b32_e32 v16, s20
-; SDAG-NEXT: v_mov_b32_e32 v17, s21
-; SDAG-NEXT: v_mov_b32_e32 v18, s22
-; SDAG-NEXT: v_mov_b32_e32 v19, s23
-; SDAG-NEXT: v_mov_b32_e32 v20, s24
-; SDAG-NEXT: v_mov_b32_e32 v21, s25
-; SDAG-NEXT: v_mov_b32_e32 v22, s26
-; SDAG-NEXT: v_mov_b32_e32 v23, s27
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v32
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v33
+; SDAG-NEXT: v_mov_b32_e32 v26, s0
+; SDAG-NEXT: v_mov_b32_e32 v27, s1
+; SDAG-NEXT: v_mov_b32_e32 v28, s2
+; SDAG-NEXT: v_mov_b32_e32 v29, s3
+; SDAG-NEXT: v_mov_b32_e32 v30, s16
+; SDAG-NEXT: v_mov_b32_e32 v31, s17
+; SDAG-NEXT: v_mov_b32_e32 v32, s18
+; SDAG-NEXT: v_mov_b32_e32 v33, s19
+; SDAG-NEXT: v_mov_b32_e32 v16, s28
+; SDAG-NEXT: v_mov_b32_e32 v17, s29
+; SDAG-NEXT: v_mov_b32_e32 v18, s20
+; SDAG-NEXT: v_mov_b32_e32 v19, s21
+; SDAG-NEXT: v_mov_b32_e32 v20, s22
+; SDAG-NEXT: v_mov_b32_e32 v21, s23
+; SDAG-NEXT: v_mov_b32_e32 v22, s24
+; SDAG-NEXT: v_mov_b32_e32 v23, s25
+; SDAG-NEXT: v_mov_b32_e32 v24, s26
+; SDAG-NEXT: v_mov_b32_e32 v25, s27
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
; SDAG-NEXT: v_accvgpr_write_b32 a2, v0
; SDAG-NEXT: v_accvgpr_write_b32 a3, v1
; SDAG-NEXT: v_accvgpr_write_b32 a4, v2
@@ -3550,7 +3550,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inr
; SDAG-NEXT: v_accvgpr_write_b32 a14, v12
; SDAG-NEXT: v_accvgpr_write_b32 a15, v13
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[24:31], v[16:23], a[0:15], v14, v15 op_sel_hi:[0,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[18:25], a[0:15], v14, v15 op_sel_hi:[0,0,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
@@ -3993,34 +3993,34 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgpr_sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v16, s0
-; SDAG-NEXT: v_mov_b32_e32 v17, s1
-; SDAG-NEXT: v_mov_b32_e32 v18, s2
-; SDAG-NEXT: v_mov_b32_e32 v19, s3
-; SDAG-NEXT: v_mov_b32_e32 v20, s16
-; SDAG-NEXT: v_mov_b32_e32 v21, s17
-; SDAG-NEXT: v_mov_b32_e32 v22, s18
-; SDAG-NEXT: v_mov_b32_e32 v23, s19
-; SDAG-NEXT: v_mov_b32_e32 v24, s20
-; SDAG-NEXT: v_mov_b32_e32 v25, s21
-; SDAG-NEXT: v_mov_b32_e32 v26, s22
-; SDAG-NEXT: v_mov_b32_e32 v27, s23
-; SDAG-NEXT: v_mov_b32_e32 v28, s24
-; SDAG-NEXT: v_mov_b32_e32 v29, s25
-; SDAG-NEXT: v_mov_b32_e32 v30, s26
-; SDAG-NEXT: v_mov_b32_e32 v31, s27
-; SDAG-NEXT: v_mov_b32_e32 v32, s28
-; SDAG-NEXT: v_mov_b32_e32 v33, s29
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v30
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v31
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v32
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v33
+; SDAG-NEXT: v_mov_b32_e32 v26, s0
+; SDAG-NEXT: v_mov_b32_e32 v27, s1
+; SDAG-NEXT: v_mov_b32_e32 v28, s2
+; SDAG-NEXT: v_mov_b32_e32 v29, s3
+; SDAG-NEXT: v_mov_b32_e32 v30, s16
+; SDAG-NEXT: v_mov_b32_e32 v31, s17
+; SDAG-NEXT: v_mov_b32_e32 v32, s18
+; SDAG-NEXT: v_mov_b32_e32 v33, s19
+; SDAG-NEXT: v_mov_b32_e32 v16, s20
+; SDAG-NEXT: v_mov_b32_e32 v17, s21
+; SDAG-NEXT: v_mov_b32_e32 v18, s22
+; SDAG-NEXT: v_mov_b32_e32 v19, s23
+; SDAG-NEXT: v_mov_b32_e32 v20, s24
+; SDAG-NEXT: v_mov_b32_e32 v21, s25
+; SDAG-NEXT: v_mov_b32_e32 v22, s26
+; SDAG-NEXT: v_mov_b32_e32 v23, s27
+; SDAG-NEXT: v_mov_b32_e32 v24, s28
+; SDAG-NEXT: v_mov_b32_e32 v25, s29
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
; SDAG-NEXT: v_accvgpr_write_b32 a10, v8
; SDAG-NEXT: v_accvgpr_write_b32 a11, v9
; SDAG-NEXT: v_accvgpr_write_b32 a12, v10
@@ -4028,7 +4028,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp
; SDAG-NEXT: v_accvgpr_write_b32 a14, v12
; SDAG-NEXT: v_accvgpr_write_b32 a15, v13
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[16:23], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
@@ -4540,22 +4540,22 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32>
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x80
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_accvgpr_write_b32 a0, s36
-; SDAG-NEXT: v_mov_b32_e32 v0, s8
-; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: v_mov_b32_e32 v4, s12
-; SDAG-NEXT: v_mov_b32_e32 v5, s13
-; SDAG-NEXT: v_mov_b32_e32 v6, s14
-; SDAG-NEXT: v_mov_b32_e32 v7, s15
-; SDAG-NEXT: v_mov_b32_e32 v8, s16
-; SDAG-NEXT: v_mov_b32_e32 v9, s17
-; SDAG-NEXT: v_mov_b32_e32 v10, s18
-; SDAG-NEXT: v_mov_b32_e32 v11, s19
-; SDAG-NEXT: v_mov_b32_e32 v12, s20
-; SDAG-NEXT: v_mov_b32_e32 v13, s21
-; SDAG-NEXT: v_mov_b32_e32 v14, s22
-; SDAG-NEXT: v_mov_b32_e32 v15, s23
+; SDAG-NEXT: v_mov_b32_e32 v2, s8
+; SDAG-NEXT: v_mov_b32_e32 v3, s9
+; SDAG-NEXT: v_mov_b32_e32 v4, s10
+; SDAG-NEXT: v_mov_b32_e32 v5, s11
+; SDAG-NEXT: v_mov_b32_e32 v6, s12
+; SDAG-NEXT: v_mov_b32_e32 v7, s13
+; SDAG-NEXT: v_mov_b32_e32 v8, s14
+; SDAG-NEXT: v_mov_b32_e32 v9, s15
+; SDAG-NEXT: v_mov_b32_e32 v10, s16
+; SDAG-NEXT: v_mov_b32_e32 v11, s17
+; SDAG-NEXT: v_mov_b32_e32 v12, s18
+; SDAG-NEXT: v_mov_b32_e32 v13, s19
+; SDAG-NEXT: v_mov_b32_e32 v14, s20
+; SDAG-NEXT: v_mov_b32_e32 v15, s21
+; SDAG-NEXT: v_mov_b32_e32 v16, s22
+; SDAG-NEXT: v_mov_b32_e32 v17, s23
; SDAG-NEXT: v_accvgpr_write_b32 a1, s37
; SDAG-NEXT: v_accvgpr_write_b32 a2, s38
; SDAG-NEXT: v_accvgpr_write_b32 a3, s39
@@ -4571,9 +4571,9 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32>
; SDAG-NEXT: v_accvgpr_write_b32 a13, s49
; SDAG-NEXT: v_accvgpr_write_b32 a14, s50
; SDAG-NEXT: v_accvgpr_write_b32 a15, s51
-; SDAG-NEXT: v_mov_b32_e32 v16, s1
+; SDAG-NEXT: v_mov_b32_e32 v0, s1
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v16 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[2:9], v[10:17], a[0:15], s0, v0 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
; SDAG-NEXT: v_mov_b32_e32 v0, 0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 7
@@ -4735,26 +4735,26 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac(<8 x
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v0, s12
-; SDAG-NEXT: v_mov_b32_e32 v1, s13
-; SDAG-NEXT: v_mov_b32_e32 v2, s14
-; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: v_mov_b32_e32 v4, s16
-; SDAG-NEXT: v_mov_b32_e32 v5, s17
-; SDAG-NEXT: v_mov_b32_e32 v6, s18
-; SDAG-NEXT: v_mov_b32_e32 v7, s19
-; SDAG-NEXT: v_mov_b32_e32 v8, s20
-; SDAG-NEXT: v_mov_b32_e32 v9, s21
-; SDAG-NEXT: v_mov_b32_e32 v10, s22
-; SDAG-NEXT: v_mov_b32_e32 v11, s23
+; SDAG-NEXT: v_mov_b32_e32 v2, s12
+; SDAG-NEXT: v_mov_b32_e32 v3, s13
+; SDAG-NEXT: v_mov_b32_e32 v4, s14
+; SDAG-NEXT: v_mov_b32_e32 v5, s15
+; SDAG-NEXT: v_mov_b32_e32 v6, s16
+; SDAG-NEXT: v_mov_b32_e32 v7, s17
+; SDAG-NEXT: v_mov_b32_e32 v8, s18
+; SDAG-NEXT: v_mov_b32_e32 v9, s19
+; SDAG-NEXT: v_mov_b32_e32 v10, s20
+; SDAG-NEXT: v_mov_b32_e32 v11, s21
+; SDAG-NEXT: v_mov_b32_e32 v12, s22
+; SDAG-NEXT: v_mov_b32_e32 v13, s23
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80
-; SDAG-NEXT: v_mov_b32_e32 v12, s24
-; SDAG-NEXT: v_mov_b32_e32 v13, s25
-; SDAG-NEXT: v_mov_b32_e32 v14, s26
+; SDAG-NEXT: v_mov_b32_e32 v14, s24
+; SDAG-NEXT: v_mov_b32_e32 v15, s25
+; SDAG-NEXT: v_mov_b32_e32 v16, s26
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
-; SDAG-NEXT: v_mov_b32_e32 v15, s27
+; SDAG-NEXT: v_mov_b32_e32 v17, s27
; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
@@ -4770,45 +4770,44 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac(<8 x
; SDAG-NEXT: v_accvgpr_write_b32 a13, s21
; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
-; SDAG-NEXT: v_mov_b32_e32 v16, s1
+; SDAG-NEXT: v_mov_b32_e32 v0, s1
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v16 op_sel_hi:[0,0,0]
-; SDAG-NEXT: v_mov_b32_e32 v0, s20
-; SDAG-NEXT: v_mov_b32_e32 v1, s21
-; SDAG-NEXT: v_mov_b32_e32 v2, s22
-; SDAG-NEXT: v_mov_b32_e32 v3, s23
-; SDAG-NEXT: v_mov_b64_e32 v[4:5], 48
-; SDAG-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[2:9], v[10:17], a[0:15], s0, v0 op_sel_hi:[0,0,0]
+; SDAG-NEXT: v_mov_b32_e32 v2, s20
+; SDAG-NEXT: v_mov_b32_e32 v3, s21
+; SDAG-NEXT: v_mov_b32_e32 v4, s22
+; SDAG-NEXT: v_mov_b32_e32 v5, s23
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48
+; SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[6:7], 32
-; SDAG-NEXT: v_mov_b64_e32 v[8:9], 16
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v6, s18
+; SDAG-NEXT: v_mov_b32_e32 v7, s19
+; SDAG-NEXT: v_mov_b32_e32 v4, s16
+; SDAG-NEXT: v_mov_b32_e32 v5, s17
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], 32
+; SDAG-NEXT: global_store_dwordx4 v[2:3], v[4:7], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[10:11], 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s12
-; SDAG-NEXT: v_mov_b32_e32 v1, s13
-; SDAG-NEXT: v_mov_b32_e32 v2, s14
-; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v8, s14
+; SDAG-NEXT: v_mov_b32_e32 v9, s15
+; SDAG-NEXT: v_mov_b32_e32 v6, s12
+; SDAG-NEXT: v_mov_b32_e32 v7, s13
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16
+; SDAG-NEXT: global_store_dwordx4 v[4:5], v[6:9], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s8
-; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v10, s10
+; SDAG-NEXT: v_mov_b32_e32 v11, s11
+; SDAG-NEXT: v_mov_b32_e32 v8, s8
+; SDAG-NEXT: v_mov_b32_e32 v9, s9
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], 0
+; SDAG-NEXT: global_store_dwordx4 v[6:7], v[8:11], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[6:7], a[8:11], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[2:3], a[8:11], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[4:5], a[12:15], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[0:1], a[12:15], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[10:11], a[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[6:7], a[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[8:9], a[4:7], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[4:5], a[4:7], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -4922,42 +4921,41 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac(<8
; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 25, 42 op_sel_hi:[0,0,0] blgp:2
-; SDAG-NEXT: v_mov_b32_e32 v0, s20
-; SDAG-NEXT: v_mov_b32_e32 v1, s21
-; SDAG-NEXT: v_mov_b32_e32 v2, s22
-; SDAG-NEXT: v_mov_b32_e32 v3, s23
-; SDAG-NEXT: v_mov_b64_e32 v[4:5], 48
-; SDAG-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v2, s20
+; SDAG-NEXT: v_mov_b32_e32 v3, s21
+; SDAG-NEXT: v_mov_b32_e32 v4, s22
+; SDAG-NEXT: v_mov_b32_e32 v5, s23
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48
+; SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[6:7], 32
-; SDAG-NEXT: v_mov_b64_e32 v[8:9], 16
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v6, s18
+; SDAG-NEXT: v_mov_b32_e32 v7, s19
+; SDAG-NEXT: v_mov_b32_e32 v4, s16
+; SDAG-NEXT: v_mov_b32_e32 v5, s17
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], 32
+; SDAG-NEXT: global_store_dwordx4 v[2:3], v[4:7], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[10:11], 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s12
-; SDAG-NEXT: v_mov_b32_e32 v1, s13
-; SDAG-NEXT: v_mov_b32_e32 v2, s14
-; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v8, s14
+; SDAG-NEXT: v_mov_b32_e32 v9, s15
+; SDAG-NEXT: v_mov_b32_e32 v6, s12
+; SDAG-NEXT: v_mov_b32_e32 v7, s13
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16
+; SDAG-NEXT: global_store_dwordx4 v[4:5], v[6:9], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s8
-; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v10, s10
+; SDAG-NEXT: v_mov_b32_e32 v11, s11
+; SDAG-NEXT: v_mov_b32_e32 v8, s8
+; SDAG-NEXT: v_mov_b32_e32 v9, s9
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], 0
+; SDAG-NEXT: global_store_dwordx4 v[6:7], v[8:11], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[6:7], a[8:11], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[2:3], a[8:11], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[4:5], a[12:15], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[0:1], a[12:15], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[10:11], a[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[6:7], a[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[8:9], a[4:7], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[4:5], a[4:7], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -5069,42 +5067,41 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma
; SDAG-NEXT: v_accvgpr_write_b32 a16, s8
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[16:31] blgp:2
-; SDAG-NEXT: v_mov_b32_e32 v0, s20
-; SDAG-NEXT: v_mov_b32_e32 v1, s21
-; SDAG-NEXT: v_mov_b32_e32 v2, s22
-; SDAG-NEXT: v_mov_b32_e32 v3, s23
-; SDAG-NEXT: v_mov_b64_e32 v[4:5], 48
-; SDAG-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v2, s20
+; SDAG-NEXT: v_mov_b32_e32 v3, s21
+; SDAG-NEXT: v_mov_b32_e32 v4, s22
+; SDAG-NEXT: v_mov_b32_e32 v5, s23
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48
+; SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[6:7], 32
-; SDAG-NEXT: v_mov_b64_e32 v[8:9], 16
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v6, s18
+; SDAG-NEXT: v_mov_b32_e32 v7, s19
+; SDAG-NEXT: v_mov_b32_e32 v4, s16
+; SDAG-NEXT: v_mov_b32_e32 v5, s17
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], 32
+; SDAG-NEXT: global_store_dwordx4 v[2:3], v[4:7], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[10:11], 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s12
-; SDAG-NEXT: v_mov_b32_e32 v1, s13
-; SDAG-NEXT: v_mov_b32_e32 v2, s14
-; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v8, s14
+; SDAG-NEXT: v_mov_b32_e32 v9, s15
+; SDAG-NEXT: v_mov_b32_e32 v6, s12
+; SDAG-NEXT: v_mov_b32_e32 v7, s13
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16
+; SDAG-NEXT: global_store_dwordx4 v[4:5], v[6:9], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s8
-; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v10, s10
+; SDAG-NEXT: v_mov_b32_e32 v11, s11
+; SDAG-NEXT: v_mov_b32_e32 v8, s8
+; SDAG-NEXT: v_mov_b32_e32 v9, s9
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], 0
+; SDAG-NEXT: global_store_dwordx4 v[6:7], v[8:11], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[6:7], a[8:11], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[2:3], a[8:11], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[4:5], a[12:15], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[0:1], a[12:15], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[10:11], a[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[6:7], a[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[8:9], a[4:7], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[4:5], a[4:7], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -5216,42 +5213,41 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non
; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 25, 42 op_sel_hi:[0,0,0] blgp:2
-; SDAG-NEXT: v_mov_b32_e32 v0, s20
-; SDAG-NEXT: v_mov_b32_e32 v1, s21
-; SDAG-NEXT: v_mov_b32_e32 v2, s22
-; SDAG-NEXT: v_mov_b32_e32 v3, s23
-; SDAG-NEXT: v_mov_b64_e32 v[4:5], 48
-; SDAG-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v2, s20
+; SDAG-NEXT: v_mov_b32_e32 v3, s21
+; SDAG-NEXT: v_mov_b32_e32 v4, s22
+; SDAG-NEXT: v_mov_b32_e32 v5, s23
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48
+; SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[6:7], 32
-; SDAG-NEXT: v_mov_b64_e32 v[8:9], 16
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v6, s18
+; SDAG-NEXT: v_mov_b32_e32 v7, s19
+; SDAG-NEXT: v_mov_b32_e32 v4, s16
+; SDAG-NEXT: v_mov_b32_e32 v5, s17
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], 32
+; SDAG-NEXT: global_store_dwordx4 v[2:3], v[4:7], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[10:11], 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s12
-; SDAG-NEXT: v_mov_b32_e32 v1, s13
-; SDAG-NEXT: v_mov_b32_e32 v2, s14
-; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v8, s14
+; SDAG-NEXT: v_mov_b32_e32 v9, s15
+; SDAG-NEXT: v_mov_b32_e32 v6, s12
+; SDAG-NEXT: v_mov_b32_e32 v7, s13
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16
+; SDAG-NEXT: global_store_dwordx4 v[4:5], v[6:9], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s8
-; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v10, s10
+; SDAG-NEXT: v_mov_b32_e32 v11, s11
+; SDAG-NEXT: v_mov_b32_e32 v8, s8
+; SDAG-NEXT: v_mov_b32_e32 v9, s9
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], 0
+; SDAG-NEXT: global_store_dwordx4 v[6:7], v[8:11], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[6:7], a[8:11], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[2:3], a[8:11], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[4:5], a[12:15], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[0:1], a[12:15], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[10:11], a[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[6:7], a[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[8:9], a[4:7], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[4:5], a[4:7], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
index 77d4aad5f3174..80568810e42b5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
@@ -17,24 +17,24 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_f16__vgpr(ptr addrspace(1) %
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
+; SDAG-NEXT: v_mov_b32_e32 v12, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
+; SDAG-NEXT: global_load_dwordx4 v[14:17], v0, s[6:7]
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
-; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
-; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; SDAG-NEXT: v_mov_b32_e32 v17, s16
+; SDAG-NEXT: v_mov_b32_e32 v13, s16
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
+; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 v[14:17], v[8:11], v[0:7], v13 cbsz:1 abid:2
; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7]
+; SDAG-NEXT: global_store_dwordx4 v12, v[14:17], s[6:7]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_smfmac_f32_16x16x64_f16__vgpr:
@@ -44,23 +44,23 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_f16__vgpr(ptr addrspace(1) %
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
+; GISEL-NEXT: global_load_dwordx4 v[14:17], v0, s[6:7]
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
; GISEL-NEXT: s_load_dword s16, s[4:5], 0x64
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; GISEL-NEXT: v_mov_b32_e32 v16, s16
+; GISEL-NEXT: v_mov_b32_e32 v12, s16
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_smfmac_f32_16x16x64_f16 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
+; GISEL-NEXT: v_smfmac_f32_16x16x64_f16 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2
; GISEL-NEXT: v_mov_b32_e32 v0, 0
; GISEL-NEXT: s_nop 6
-; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[6:7]
+; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[6:7]
; GISEL-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -120,25 +120,25 @@ define <4 x float> @test_smfmac_f32_16x16x64_f16__sgpr(<8 x half> inreg %arg0, <
; SDAG-LABEL: test_smfmac_f32_16x16x64_f16__sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v8, s0
-; SDAG-NEXT: v_mov_b32_e32 v9, s1
-; SDAG-NEXT: v_mov_b32_e32 v10, s2
-; SDAG-NEXT: v_mov_b32_e32 v11, s3
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: v_mov_b32_e32 v4, s20
-; SDAG-NEXT: v_mov_b32_e32 v5, s21
-; SDAG-NEXT: v_mov_b32_e32 v6, s22
-; SDAG-NEXT: v_mov_b32_e32 v7, s23
+; SDAG-NEXT: v_mov_b32_e32 v10, s0
+; SDAG-NEXT: v_mov_b32_e32 v11, s1
+; SDAG-NEXT: v_mov_b32_e32 v12, s2
+; SDAG-NEXT: v_mov_b32_e32 v13, s3
+; SDAG-NEXT: v_mov_b32_e32 v2, s16
+; SDAG-NEXT: v_mov_b32_e32 v3, s17
+; SDAG-NEXT: v_mov_b32_e32 v4, s18
+; SDAG-NEXT: v_mov_b32_e32 v5, s19
+; SDAG-NEXT: v_mov_b32_e32 v6, s20
+; SDAG-NEXT: v_mov_b32_e32 v7, s21
+; SDAG-NEXT: v_mov_b32_e32 v8, s22
+; SDAG-NEXT: v_mov_b32_e32 v9, s23
; SDAG-NEXT: v_accvgpr_write_b32 a0, s24
; SDAG-NEXT: v_accvgpr_write_b32 a1, s25
; SDAG-NEXT: v_accvgpr_write_b32 a2, s26
; SDAG-NEXT: v_accvgpr_write_b32 a3, s27
-; SDAG-NEXT: v_mov_b32_e32 v12, s28
+; SDAG-NEXT: v_mov_b32_e32 v0, s28
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 a[0:3], v[8:11], v[0:7], v12
+; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 a[0:3], v[10:13], v[2:9], v0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
@@ -547,24 +547,24 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_bf16__vgpr(ptr addrspace(1)
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GCN-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GCN-NEXT: v_mov_b32_e32 v16, 0
+; GCN-NEXT: v_mov_b32_e32 v12, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
+; GCN-NEXT: global_load_dwordx4 v[14:17], v0, s[6:7]
; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
; GCN-NEXT: s_load_dword s16, s[4:5], 0x64
-; GCN-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
-; GCN-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
+; GCN-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; GCN-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GCN-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GCN-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; GCN-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; GCN-NEXT: v_mov_b32_e32 v17, s16
+; GCN-NEXT: v_mov_b32_e32 v13, s16
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
+; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 v[14:17], v[8:11], v[0:7], v13 cbsz:1 abid:2
; GCN-NEXT: s_nop 7
-; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7]
+; GCN-NEXT: global_store_dwordx4 v12, v[14:17], s[6:7]
; GCN-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -624,25 +624,25 @@ define <4 x float> @test_smfmac_f32_16x16x64_bf16__sgpr(<8 x bfloat> inreg %arg0
; GCN-LABEL: test_smfmac_f32_16x16x64_bf16__sgpr:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v8, s0
-; GCN-NEXT: v_mov_b32_e32 v9, s1
-; GCN-NEXT: v_mov_b32_e32 v10, s2
-; GCN-NEXT: v_mov_b32_e32 v11, s3
-; GCN-NEXT: v_mov_b32_e32 v0, s16
-; GCN-NEXT: v_mov_b32_e32 v1, s17
-; GCN-NEXT: v_mov_b32_e32 v2, s18
-; GCN-NEXT: v_mov_b32_e32 v3, s19
-; GCN-NEXT: v_mov_b32_e32 v4, s20
-; GCN-NEXT: v_mov_b32_e32 v5, s21
-; GCN-NEXT: v_mov_b32_e32 v6, s22
-; GCN-NEXT: v_mov_b32_e32 v7, s23
+; GCN-NEXT: v_mov_b32_e32 v10, s0
+; GCN-NEXT: v_mov_b32_e32 v11, s1
+; GCN-NEXT: v_mov_b32_e32 v12, s2
+; GCN-NEXT: v_mov_b32_e32 v13, s3
+; GCN-NEXT: v_mov_b32_e32 v2, s16
+; GCN-NEXT: v_mov_b32_e32 v3, s17
+; GCN-NEXT: v_mov_b32_e32 v4, s18
+; GCN-NEXT: v_mov_b32_e32 v5, s19
+; GCN-NEXT: v_mov_b32_e32 v6, s20
+; GCN-NEXT: v_mov_b32_e32 v7, s21
+; GCN-NEXT: v_mov_b32_e32 v8, s22
+; GCN-NEXT: v_mov_b32_e32 v9, s23
; GCN-NEXT: v_accvgpr_write_b32 a0, s24
; GCN-NEXT: v_accvgpr_write_b32 a1, s25
; GCN-NEXT: v_accvgpr_write_b32 a2, s26
; GCN-NEXT: v_accvgpr_write_b32 a3, s27
-; GCN-NEXT: v_mov_b32_e32 v12, s28
+; GCN-NEXT: v_mov_b32_e32 v0, s28
; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 a[0:3], v[8:11], v[0:7], v12
+; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 a[0:3], v[10:13], v[2:9], v0
; GCN-NEXT: s_nop 7
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
@@ -855,30 +855,30 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) %
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
+; SDAG-NEXT: global_load_dwordx4 v[10:13], v0, s[6:7]
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; SDAG-NEXT: v_mov_b32_e32 v12, s8
-; SDAG-NEXT: v_mov_b32_e32 v13, s9
-; SDAG-NEXT: v_mov_b32_e32 v14, s10
-; SDAG-NEXT: v_mov_b32_e32 v15, s11
-; SDAG-NEXT: v_mov_b32_e32 v0, s12
-; SDAG-NEXT: v_mov_b32_e32 v1, s13
-; SDAG-NEXT: v_mov_b32_e32 v2, s14
-; SDAG-NEXT: v_mov_b32_e32 v3, s15
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: v_mov_b32_e32 v14, s8
+; SDAG-NEXT: v_mov_b32_e32 v15, s9
+; SDAG-NEXT: v_mov_b32_e32 v16, s10
+; SDAG-NEXT: v_mov_b32_e32 v17, s11
+; SDAG-NEXT: v_mov_b32_e32 v2, s12
+; SDAG-NEXT: v_mov_b32_e32 v3, s13
+; SDAG-NEXT: v_mov_b32_e32 v4, s14
+; SDAG-NEXT: v_mov_b32_e32 v5, s15
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v4, s0
-; SDAG-NEXT: v_mov_b32_e32 v5, s1
-; SDAG-NEXT: v_mov_b32_e32 v6, s2
-; SDAG-NEXT: v_mov_b32_e32 v7, s3
-; SDAG-NEXT: v_mov_b32_e32 v17, s16
+; SDAG-NEXT: v_mov_b32_e32 v6, s0
+; SDAG-NEXT: v_mov_b32_e32 v7, s1
+; SDAG-NEXT: v_mov_b32_e32 v8, s2
+; SDAG-NEXT: v_mov_b32_e32 v9, s3
+; SDAG-NEXT: v_mov_b32_e32 v1, s16
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
+; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 v[10:13], v[14:17], v[2:9], v1 cbsz:1 abid:2
; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7]
+; SDAG-NEXT: global_store_dwordx4 v0, v[10:13], s[6:7]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_smfmac_i32_16x16x128_i8__vgpr:
@@ -887,24 +887,24 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) %
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
+; GISEL-NEXT: global_load_dwordx4 v[14:17], v0, s[0:1]
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
-; GISEL-NEXT: v_mov_b32_e32 v16, s2
+; GISEL-NEXT: v_mov_b32_e32 v12, s2
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_smfmac_i32_16x16x128_i8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
+; GISEL-NEXT: v_smfmac_i32_16x16x128_i8 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2
; GISEL-NEXT: v_mov_b32_e32 v0, 0
; GISEL-NEXT: s_nop 6
-; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
+; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1]
; GISEL-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -964,25 +964,25 @@ define <4 x i32> @test_smfmac_i32_16x16x128_i8__sgpr(<4 x i32> inreg %arg0, <8 x
; SDAG-LABEL: test_smfmac_i32_16x16x128_i8__sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v8, s0
-; SDAG-NEXT: v_mov_b32_e32 v9, s1
-; SDAG-NEXT: v_mov_b32_e32 v10, s2
-; SDAG-NEXT: v_mov_b32_e32 v11, s3
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: v_mov_b32_e32 v4, s20
-; SDAG-NEXT: v_mov_b32_e32 v5, s21
-; SDAG-NEXT: v_mov_b32_e32 v6, s22
-; SDAG-NEXT: v_mov_b32_e32 v7, s23
+; SDAG-NEXT: v_mov_b32_e32 v10, s0
+; SDAG-NEXT: v_mov_b32_e32 v11, s1
+; SDAG-NEXT: v_mov_b32_e32 v12, s2
+; SDAG-NEXT: v_mov_b32_e32 v13, s3
+; SDAG-NEXT: v_mov_b32_e32 v2, s16
+; SDAG-NEXT: v_mov_b32_e32 v3, s17
+; SDAG-NEXT: v_mov_b32_e32 v4, s18
+; SDAG-NEXT: v_mov_b32_e32 v5, s19
+; SDAG-NEXT: v_mov_b32_e32 v6, s20
+; SDAG-NEXT: v_mov_b32_e32 v7, s21
+; SDAG-NEXT: v_mov_b32_e32 v8, s22
+; SDAG-NEXT: v_mov_b32_e32 v9, s23
; SDAG-NEXT: v_accvgpr_write_b32 a0, s24
; SDAG-NEXT: v_accvgpr_write_b32 a1, s25
; SDAG-NEXT: v_accvgpr_write_b32 a2, s26
; SDAG-NEXT: v_accvgpr_write_b32 a3, s27
-; SDAG-NEXT: v_mov_b32_e32 v12, s28
+; SDAG-NEXT: v_mov_b32_e32 v0, s28
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 a[0:3], v[8:11], v[0:7], v12
+; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 a[0:3], v[10:13], v[2:9], v0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
@@ -1032,22 +1032,22 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x64_i8__vgpr(ptr addrspace(1) %a
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v24, s8
-; SDAG-NEXT: v_mov_b32_e32 v25, s9
-; SDAG-NEXT: v_mov_b32_e32 v26, s10
-; SDAG-NEXT: v_mov_b32_e32 v27, s11
-; SDAG-NEXT: v_mov_b32_e32 v16, s12
-; SDAG-NEXT: v_mov_b32_e32 v17, s13
-; SDAG-NEXT: v_mov_b32_e32 v18, s14
-; SDAG-NEXT: v_mov_b32_e32 v19, s15
-; SDAG-NEXT: v_mov_b32_e32 v20, s0
-; SDAG-NEXT: v_mov_b32_e32 v21, s1
-; SDAG-NEXT: v_mov_b32_e32 v22, s2
-; SDAG-NEXT: v_mov_b32_e32 v23, s3
-; SDAG-NEXT: v_mov_b32_e32 v28, s16
+; SDAG-NEXT: v_mov_b32_e32 v26, s8
+; SDAG-NEXT: v_mov_b32_e32 v27, s9
+; SDAG-NEXT: v_mov_b32_e32 v28, s10
+; SDAG-NEXT: v_mov_b32_e32 v29, s11
+; SDAG-NEXT: v_mov_b32_e32 v18, s12
+; SDAG-NEXT: v_mov_b32_e32 v19, s13
+; SDAG-NEXT: v_mov_b32_e32 v20, s14
+; SDAG-NEXT: v_mov_b32_e32 v21, s15
+; SDAG-NEXT: v_mov_b32_e32 v22, s0
+; SDAG-NEXT: v_mov_b32_e32 v23, s1
+; SDAG-NEXT: v_mov_b32_e32 v24, s2
+; SDAG-NEXT: v_mov_b32_e32 v25, s3
+; SDAG-NEXT: v_mov_b32_e32 v16, s16
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
+; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[26:29], v[18:25], v16 cbsz:1 abid:2
; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 2
@@ -1397,30 +1397,30 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
+; SDAG-NEXT: global_load_dwordx4 v[10:13], v0, s[6:7]
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; SDAG-NEXT: v_mov_b32_e32 v12, s8
-; SDAG-NEXT: v_mov_b32_e32 v13, s9
-; SDAG-NEXT: v_mov_b32_e32 v14, s10
-; SDAG-NEXT: v_mov_b32_e32 v15, s11
-; SDAG-NEXT: v_mov_b32_e32 v0, s12
-; SDAG-NEXT: v_mov_b32_e32 v1, s13
-; SDAG-NEXT: v_mov_b32_e32 v2, s14
-; SDAG-NEXT: v_mov_b32_e32 v3, s15
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: v_mov_b32_e32 v14, s8
+; SDAG-NEXT: v_mov_b32_e32 v15, s9
+; SDAG-NEXT: v_mov_b32_e32 v16, s10
+; SDAG-NEXT: v_mov_b32_e32 v17, s11
+; SDAG-NEXT: v_mov_b32_e32 v2, s12
+; SDAG-NEXT: v_mov_b32_e32 v3, s13
+; SDAG-NEXT: v_mov_b32_e32 v4, s14
+; SDAG-NEXT: v_mov_b32_e32 v5, s15
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v4, s0
-; SDAG-NEXT: v_mov_b32_e32 v5, s1
-; SDAG-NEXT: v_mov_b32_e32 v6, s2
-; SDAG-NEXT: v_mov_b32_e32 v7, s3
-; SDAG-NEXT: v_mov_b32_e32 v17, s16
+; SDAG-NEXT: v_mov_b32_e32 v6, s0
+; SDAG-NEXT: v_mov_b32_e32 v7, s1
+; SDAG-NEXT: v_mov_b32_e32 v8, s2
+; SDAG-NEXT: v_mov_b32_e32 v9, s3
+; SDAG-NEXT: v_mov_b32_e32 v1, s16
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
+; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[10:13], v[14:17], v[2:9], v1 cbsz:1 abid:2
; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7]
+; SDAG-NEXT: global_store_dwordx4 v0, v[10:13], s[6:7]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__vgpr:
@@ -1429,24 +1429,24 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
+; GISEL-NEXT: global_load_dwordx4 v[14:17], v0, s[0:1]
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
-; GISEL-NEXT: v_mov_b32_e32 v16, s2
+; GISEL-NEXT: v_mov_b32_e32 v12, s2
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
+; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2
; GISEL-NEXT: v_mov_b32_e32 v0, 0
; GISEL-NEXT: s_nop 6
-; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
+; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1]
; GISEL-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1506,25 +1506,25 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__sgpr(<4 x i32> inreg %arg
; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v8, s0
-; SDAG-NEXT: v_mov_b32_e32 v9, s1
-; SDAG-NEXT: v_mov_b32_e32 v10, s2
-; SDAG-NEXT: v_mov_b32_e32 v11, s3
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: v_mov_b32_e32 v4, s20
-; SDAG-NEXT: v_mov_b32_e32 v5, s21
-; SDAG-NEXT: v_mov_b32_e32 v6, s22
-; SDAG-NEXT: v_mov_b32_e32 v7, s23
+; SDAG-NEXT: v_mov_b32_e32 v10, s0
+; SDAG-NEXT: v_mov_b32_e32 v11, s1
+; SDAG-NEXT: v_mov_b32_e32 v12, s2
+; SDAG-NEXT: v_mov_b32_e32 v13, s3
+; SDAG-NEXT: v_mov_b32_e32 v2, s16
+; SDAG-NEXT: v_mov_b32_e32 v3, s17
+; SDAG-NEXT: v_mov_b32_e32 v4, s18
+; SDAG-NEXT: v_mov_b32_e32 v5, s19
+; SDAG-NEXT: v_mov_b32_e32 v6, s20
+; SDAG-NEXT: v_mov_b32_e32 v7, s21
+; SDAG-NEXT: v_mov_b32_e32 v8, s22
+; SDAG-NEXT: v_mov_b32_e32 v9, s23
; SDAG-NEXT: v_accvgpr_write_b32 a0, s24
; SDAG-NEXT: v_accvgpr_write_b32 a1, s25
; SDAG-NEXT: v_accvgpr_write_b32 a2, s26
; SDAG-NEXT: v_accvgpr_write_b32 a3, s27
-; SDAG-NEXT: v_mov_b32_e32 v12, s28
+; SDAG-NEXT: v_mov_b32_e32 v0, s28
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 a[0:3], v[8:11], v[0:7], v12
+; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 a[0:3], v[10:13], v[2:9], v0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
@@ -1566,30 +1566,30 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
+; SDAG-NEXT: global_load_dwordx4 v[10:13], v0, s[6:7]
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; SDAG-NEXT: v_mov_b32_e32 v12, s8
-; SDAG-NEXT: v_mov_b32_e32 v13, s9
-; SDAG-NEXT: v_mov_b32_e32 v14, s10
-; SDAG-NEXT: v_mov_b32_e32 v15, s11
-; SDAG-NEXT: v_mov_b32_e32 v0, s12
-; SDAG-NEXT: v_mov_b32_e32 v1, s13
-; SDAG-NEXT: v_mov_b32_e32 v2, s14
-; SDAG-NEXT: v_mov_b32_e32 v3, s15
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: v_mov_b32_e32 v14, s8
+; SDAG-NEXT: v_mov_b32_e32 v15, s9
+; SDAG-NEXT: v_mov_b32_e32 v16, s10
+; SDAG-NEXT: v_mov_b32_e32 v17, s11
+; SDAG-NEXT: v_mov_b32_e32 v2, s12
+; SDAG-NEXT: v_mov_b32_e32 v3, s13
+; SDAG-NEXT: v_mov_b32_e32 v4, s14
+; SDAG-NEXT: v_mov_b32_e32 v5, s15
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v4, s0
-; SDAG-NEXT: v_mov_b32_e32 v5, s1
-; SDAG-NEXT: v_mov_b32_e32 v6, s2
-; SDAG-NEXT: v_mov_b32_e32 v7, s3
-; SDAG-NEXT: v_mov_b32_e32 v17, s16
+; SDAG-NEXT: v_mov_b32_e32 v6, s0
+; SDAG-NEXT: v_mov_b32_e32 v7, s1
+; SDAG-NEXT: v_mov_b32_e32 v8, s2
+; SDAG-NEXT: v_mov_b32_e32 v9, s3
+; SDAG-NEXT: v_mov_b32_e32 v1, s16
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
+; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[10:13], v[14:17], v[2:9], v1 cbsz:1 abid:2
; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7]
+; SDAG-NEXT: global_store_dwordx4 v0, v[10:13], s[6:7]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__vgpr:
@@ -1598,24 +1598,24 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
+; GISEL-NEXT: global_load_dwordx4 v[14:17], v0, s[0:1]
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
-; GISEL-NEXT: v_mov_b32_e32 v16, s2
+; GISEL-NEXT: v_mov_b32_e32 v12, s2
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
+; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2
; GISEL-NEXT: v_mov_b32_e32 v0, 0
; GISEL-NEXT: s_nop 6
-; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
+; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1]
; GISEL-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1675,25 +1675,25 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__sgpr(<4 x i32> inreg %arg
; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v8, s0
-; SDAG-NEXT: v_mov_b32_e32 v9, s1
-; SDAG-NEXT: v_mov_b32_e32 v10, s2
-; SDAG-NEXT: v_mov_b32_e32 v11, s3
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: v_mov_b32_e32 v4, s20
-; SDAG-NEXT: v_mov_b32_e32 v5, s21
-; SDAG-NEXT: v_mov_b32_e32 v6, s22
-; SDAG-NEXT: v_mov_b32_e32 v7, s23
+; SDAG-NEXT: v_mov_b32_e32 v10, s0
+; SDAG-NEXT: v_mov_b32_e32 v11, s1
+; SDAG-NEXT: v_mov_b32_e32 v12, s2
+; SDAG-NEXT: v_mov_b32_e32 v13, s3
+; SDAG-NEXT: v_mov_b32_e32 v2, s16
+; SDAG-NEXT: v_mov_b32_e32 v3, s17
+; SDAG-NEXT: v_mov_b32_e32 v4, s18
+; SDAG-NEXT: v_mov_b32_e32 v5, s19
+; SDAG-NEXT: v_mov_b32_e32 v6, s20
+; SDAG-NEXT: v_mov_b32_e32 v7, s21
+; SDAG-NEXT: v_mov_b32_e32 v8, s22
+; SDAG-NEXT: v_mov_b32_e32 v9, s23
; SDAG-NEXT: v_accvgpr_write_b32 a0, s24
; SDAG-NEXT: v_accvgpr_write_b32 a1, s25
; SDAG-NEXT: v_accvgpr_write_b32 a2, s26
; SDAG-NEXT: v_accvgpr_write_b32 a3, s27
-; SDAG-NEXT: v_mov_b32_e32 v12, s28
+; SDAG-NEXT: v_mov_b32_e32 v0, s28
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[8:11], v[0:7], v12
+; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[10:13], v[2:9], v0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
@@ -1735,30 +1735,30 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
+; SDAG-NEXT: global_load_dwordx4 v[10:13], v0, s[6:7]
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; SDAG-NEXT: v_mov_b32_e32 v12, s8
-; SDAG-NEXT: v_mov_b32_e32 v13, s9
-; SDAG-NEXT: v_mov_b32_e32 v14, s10
-; SDAG-NEXT: v_mov_b32_e32 v15, s11
-; SDAG-NEXT: v_mov_b32_e32 v0, s12
-; SDAG-NEXT: v_mov_b32_e32 v1, s13
-; SDAG-NEXT: v_mov_b32_e32 v2, s14
-; SDAG-NEXT: v_mov_b32_e32 v3, s15
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: v_mov_b32_e32 v14, s8
+; SDAG-NEXT: v_mov_b32_e32 v15, s9
+; SDAG-NEXT: v_mov_b32_e32 v16, s10
+; SDAG-NEXT: v_mov_b32_e32 v17, s11
+; SDAG-NEXT: v_mov_b32_e32 v2, s12
+; SDAG-NEXT: v_mov_b32_e32 v3, s13
+; SDAG-NEXT: v_mov_b32_e32 v4, s14
+; SDAG-NEXT: v_mov_b32_e32 v5, s15
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v4, s0
-; SDAG-NEXT: v_mov_b32_e32 v5, s1
-; SDAG-NEXT: v_mov_b32_e32 v6, s2
-; SDAG-NEXT: v_mov_b32_e32 v7, s3
-; SDAG-NEXT: v_mov_b32_e32 v17, s16
+; SDAG-NEXT: v_mov_b32_e32 v6, s0
+; SDAG-NEXT: v_mov_b32_e32 v7, s1
+; SDAG-NEXT: v_mov_b32_e32 v8, s2
+; SDAG-NEXT: v_mov_b32_e32 v9, s3
+; SDAG-NEXT: v_mov_b32_e32 v1, s16
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
+; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[10:13], v[14:17], v[2:9], v1 cbsz:1 abid:2
; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7]
+; SDAG-NEXT: global_store_dwordx4 v0, v[10:13], s[6:7]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__vgpr:
@@ -1767,24 +1767,24 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
+; GISEL-NEXT: global_load_dwordx4 v[14:17], v0, s[0:1]
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
-; GISEL-NEXT: v_mov_b32_e32 v16, s2
+; GISEL-NEXT: v_mov_b32_e32 v12, s2
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
+; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2
; GISEL-NEXT: v_mov_b32_e32 v0, 0
; GISEL-NEXT: s_nop 6
-; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
+; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1]
; GISEL-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1844,25 +1844,25 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__sgpr(<4 x i32> inreg %arg
; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v8, s0
-; SDAG-NEXT: v_mov_b32_e32 v9, s1
-; SDAG-NEXT: v_mov_b32_e32 v10, s2
-; SDAG-NEXT: v_mov_b32_e32 v11, s3
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: v_mov_b32_e32 v4, s20
-; SDAG-NEXT: v_mov_b32_e32 v5, s21
-; SDAG-NEXT: v_mov_b32_e32 v6, s22
-; SDAG-NEXT: v_mov_b32_e32 v7, s23
+; SDAG-NEXT: v_mov_b32_e32 v10, s0
+; SDAG-NEXT: v_mov_b32_e32 v11, s1
+; SDAG-NEXT: v_mov_b32_e32 v12, s2
+; SDAG-NEXT: v_mov_b32_e32 v13, s3
+; SDAG-NEXT: v_mov_b32_e32 v2, s16
+; SDAG-NEXT: v_mov_b32_e32 v3, s17
+; SDAG-NEXT: v_mov_b32_e32 v4, s18
+; SDAG-NEXT: v_mov_b32_e32 v5, s19
+; SDAG-NEXT: v_mov_b32_e32 v6, s20
+; SDAG-NEXT: v_mov_b32_e32 v7, s21
+; SDAG-NEXT: v_mov_b32_e32 v8, s22
+; SDAG-NEXT: v_mov_b32_e32 v9, s23
; SDAG-NEXT: v_accvgpr_write_b32 a0, s24
; SDAG-NEXT: v_accvgpr_write_b32 a1, s25
; SDAG-NEXT: v_accvgpr_write_b32 a2, s26
; SDAG-NEXT: v_accvgpr_write_b32 a3, s27
-; SDAG-NEXT: v_mov_b32_e32 v12, s28
+; SDAG-NEXT: v_mov_b32_e32 v0, s28
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 a[0:3], v[8:11], v[0:7], v12
+; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 a[0:3], v[10:13], v[2:9], v0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
@@ -1904,30 +1904,30 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
+; SDAG-NEXT: global_load_dwordx4 v[10:13], v0, s[6:7]
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; SDAG-NEXT: v_mov_b32_e32 v12, s8
-; SDAG-NEXT: v_mov_b32_e32 v13, s9
-; SDAG-NEXT: v_mov_b32_e32 v14, s10
-; SDAG-NEXT: v_mov_b32_e32 v15, s11
-; SDAG-NEXT: v_mov_b32_e32 v0, s12
-; SDAG-NEXT: v_mov_b32_e32 v1, s13
-; SDAG-NEXT: v_mov_b32_e32 v2, s14
-; SDAG-NEXT: v_mov_b32_e32 v3, s15
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: v_mov_b32_e32 v14, s8
+; SDAG-NEXT: v_mov_b32_e32 v15, s9
+; SDAG-NEXT: v_mov_b32_e32 v16, s10
+; SDAG-NEXT: v_mov_b32_e32 v17, s11
+; SDAG-NEXT: v_mov_b32_e32 v2, s12
+; SDAG-NEXT: v_mov_b32_e32 v3, s13
+; SDAG-NEXT: v_mov_b32_e32 v4, s14
+; SDAG-NEXT: v_mov_b32_e32 v5, s15
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v4, s0
-; SDAG-NEXT: v_mov_b32_e32 v5, s1
-; SDAG-NEXT: v_mov_b32_e32 v6, s2
-; SDAG-NEXT: v_mov_b32_e32 v7, s3
-; SDAG-NEXT: v_mov_b32_e32 v17, s16
+; SDAG-NEXT: v_mov_b32_e32 v6, s0
+; SDAG-NEXT: v_mov_b32_e32 v7, s1
+; SDAG-NEXT: v_mov_b32_e32 v8, s2
+; SDAG-NEXT: v_mov_b32_e32 v9, s3
+; SDAG-NEXT: v_mov_b32_e32 v1, s16
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
+; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[10:13], v[14:17], v[2:9], v1 cbsz:1 abid:2
; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7]
+; SDAG-NEXT: global_store_dwordx4 v0, v[10:13], s[6:7]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__vgpr:
@@ -1936,24 +1936,24 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
+; GISEL-NEXT: global_load_dwordx4 v[14:17], v0, s[0:1]
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
-; GISEL-NEXT: v_mov_b32_e32 v16, s2
+; GISEL-NEXT: v_mov_b32_e32 v12, s2
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
+; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2
; GISEL-NEXT: v_mov_b32_e32 v0, 0
; GISEL-NEXT: s_nop 6
-; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
+; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1]
; GISEL-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -2013,25 +2013,25 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__sgpr(<4 x i32> inreg %arg
; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v8, s0
-; SDAG-NEXT: v_mov_b32_e32 v9, s1
-; SDAG-NEXT: v_mov_b32_e32 v10, s2
-; SDAG-NEXT: v_mov_b32_e32 v11, s3
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: v_mov_b32_e32 v4, s20
-; SDAG-NEXT: v_mov_b32_e32 v5, s21
-; SDAG-NEXT: v_mov_b32_e32 v6, s22
-; SDAG-NEXT: v_mov_b32_e32 v7, s23
+; SDAG-NEXT: v_mov_b32_e32 v10, s0
+; SDAG-NEXT: v_mov_b32_e32 v11, s1
+; SDAG-NEXT: v_mov_b32_e32 v12, s2
+; SDAG-NEXT: v_mov_b32_e32 v13, s3
+; SDAG-NEXT: v_mov_b32_e32 v2, s16
+; SDAG-NEXT: v_mov_b32_e32 v3, s17
+; SDAG-NEXT: v_mov_b32_e32 v4, s18
+; SDAG-NEXT: v_mov_b32_e32 v5, s19
+; SDAG-NEXT: v_mov_b32_e32 v6, s20
+; SDAG-NEXT: v_mov_b32_e32 v7, s21
+; SDAG-NEXT: v_mov_b32_e32 v8, s22
+; SDAG-NEXT: v_mov_b32_e32 v9, s23
; SDAG-NEXT: v_accvgpr_write_b32 a0, s24
; SDAG-NEXT: v_accvgpr_write_b32 a1, s25
; SDAG-NEXT: v_accvgpr_write_b32 a2, s26
; SDAG-NEXT: v_accvgpr_write_b32 a3, s27
-; SDAG-NEXT: v_mov_b32_e32 v12, s28
+; SDAG-NEXT: v_mov_b32_e32 v0, s28
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[8:11], v[0:7], v12
+; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[10:13], v[2:9], v0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
@@ -2081,22 +2081,22 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_bf8__vgpr(ptr addrspace(
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v24, s8
-; SDAG-NEXT: v_mov_b32_e32 v25, s9
-; SDAG-NEXT: v_mov_b32_e32 v26, s10
-; SDAG-NEXT: v_mov_b32_e32 v27, s11
-; SDAG-NEXT: v_mov_b32_e32 v16, s12
-; SDAG-NEXT: v_mov_b32_e32 v17, s13
-; SDAG-NEXT: v_mov_b32_e32 v18, s14
-; SDAG-NEXT: v_mov_b32_e32 v19, s15
-; SDAG-NEXT: v_mov_b32_e32 v20, s0
-; SDAG-NEXT: v_mov_b32_e32 v21, s1
-; SDAG-NEXT: v_mov_b32_e32 v22, s2
-; SDAG-NEXT: v_mov_b32_e32 v23, s3
-; SDAG-NEXT: v_mov_b32_e32 v28, s16
+; SDAG-NEXT: v_mov_b32_e32 v26, s8
+; SDAG-NEXT: v_mov_b32_e32 v27, s9
+; SDAG-NEXT: v_mov_b32_e32 v28, s10
+; SDAG-NEXT: v_mov_b32_e32 v29, s11
+; SDAG-NEXT: v_mov_b32_e32 v18, s12
+; SDAG-NEXT: v_mov_b32_e32 v19, s13
+; SDAG-NEXT: v_mov_b32_e32 v20, s14
+; SDAG-NEXT: v_mov_b32_e32 v21, s15
+; SDAG-NEXT: v_mov_b32_e32 v22, s0
+; SDAG-NEXT: v_mov_b32_e32 v23, s1
+; SDAG-NEXT: v_mov_b32_e32 v24, s2
+; SDAG-NEXT: v_mov_b32_e32 v25, s3
+; SDAG-NEXT: v_mov_b32_e32 v16, s16
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
+; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[26:29], v[18:25], v16 cbsz:1 abid:2
; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 2
@@ -2454,22 +2454,22 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_fp8__vgpr(ptr addrspace(
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v24, s8
-; SDAG-NEXT: v_mov_b32_e32 v25, s9
-; SDAG-NEXT: v_mov_b32_e32 v26, s10
-; SDAG-NEXT: v_mov_b32_e32 v27, s11
-; SDAG-NEXT: v_mov_b32_e32 v16, s12
-; SDAG-NEXT: v_mov_b32_e32 v17, s13
-; SDAG-NEXT: v_mov_b32_e32 v18, s14
-; SDAG-NEXT: v_mov_b32_e32 v19, s15
-; SDAG-NEXT: v_mov_b32_e32 v20, s0
-; SDAG-NEXT: v_mov_b32_e32 v21, s1
-; SDAG-NEXT: v_mov_b32_e32 v22, s2
-; SDAG-NEXT: v_mov_b32_e32 v23, s3
-; SDAG-NEXT: v_mov_b32_e32 v28, s16
+; SDAG-NEXT: v_mov_b32_e32 v26, s8
+; SDAG-NEXT: v_mov_b32_e32 v27, s9
+; SDAG-NEXT: v_mov_b32_e32 v28, s10
+; SDAG-NEXT: v_mov_b32_e32 v29, s11
+; SDAG-NEXT: v_mov_b32_e32 v18, s12
+; SDAG-NEXT: v_mov_b32_e32 v19, s13
+; SDAG-NEXT: v_mov_b32_e32 v20, s14
+; SDAG-NEXT: v_mov_b32_e32 v21, s15
+; SDAG-NEXT: v_mov_b32_e32 v22, s0
+; SDAG-NEXT: v_mov_b32_e32 v23, s1
+; SDAG-NEXT: v_mov_b32_e32 v24, s2
+; SDAG-NEXT: v_mov_b32_e32 v25, s3
+; SDAG-NEXT: v_mov_b32_e32 v16, s16
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
+; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[26:29], v[18:25], v16 cbsz:1 abid:2
; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 2
@@ -2827,22 +2827,22 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_bf8__vgpr(ptr addrspace(
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v24, s8
-; SDAG-NEXT: v_mov_b32_e32 v25, s9
-; SDAG-NEXT: v_mov_b32_e32 v26, s10
-; SDAG-NEXT: v_mov_b32_e32 v27, s11
-; SDAG-NEXT: v_mov_b32_e32 v16, s12
-; SDAG-NEXT: v_mov_b32_e32 v17, s13
-; SDAG-NEXT: v_mov_b32_e32 v18, s14
-; SDAG-NEXT: v_mov_b32_e32 v19, s15
-; SDAG-NEXT: v_mov_b32_e32 v20, s0
-; SDAG-NEXT: v_mov_b32_e32 v21, s1
-; SDAG-NEXT: v_mov_b32_e32 v22, s2
-; SDAG-NEXT: v_mov_b32_e32 v23, s3
-; SDAG-NEXT: v_mov_b32_e32 v28, s16
+; SDAG-NEXT: v_mov_b32_e32 v26, s8
+; SDAG-NEXT: v_mov_b32_e32 v27, s9
+; SDAG-NEXT: v_mov_b32_e32 v28, s10
+; SDAG-NEXT: v_mov_b32_e32 v29, s11
+; SDAG-NEXT: v_mov_b32_e32 v18, s12
+; SDAG-NEXT: v_mov_b32_e32 v19, s13
+; SDAG-NEXT: v_mov_b32_e32 v20, s14
+; SDAG-NEXT: v_mov_b32_e32 v21, s15
+; SDAG-NEXT: v_mov_b32_e32 v22, s0
+; SDAG-NEXT: v_mov_b32_e32 v23, s1
+; SDAG-NEXT: v_mov_b32_e32 v24, s2
+; SDAG-NEXT: v_mov_b32_e32 v25, s3
+; SDAG-NEXT: v_mov_b32_e32 v16, s16
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
+; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[26:29], v[18:25], v16 cbsz:1 abid:2
; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 2
@@ -3200,22 +3200,22 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_fp8__vgpr(ptr addrspace(
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v24, s8
-; SDAG-NEXT: v_mov_b32_e32 v25, s9
-; SDAG-NEXT: v_mov_b32_e32 v26, s10
-; SDAG-NEXT: v_mov_b32_e32 v27, s11
-; SDAG-NEXT: v_mov_b32_e32 v16, s12
-; SDAG-NEXT: v_mov_b32_e32 v17, s13
-; SDAG-NEXT: v_mov_b32_e32 v18, s14
-; SDAG-NEXT: v_mov_b32_e32 v19, s15
-; SDAG-NEXT: v_mov_b32_e32 v20, s0
-; SDAG-NEXT: v_mov_b32_e32 v21, s1
-; SDAG-NEXT: v_mov_b32_e32 v22, s2
-; SDAG-NEXT: v_mov_b32_e32 v23, s3
-; SDAG-NEXT: v_mov_b32_e32 v28, s16
+; SDAG-NEXT: v_mov_b32_e32 v26, s8
+; SDAG-NEXT: v_mov_b32_e32 v27, s9
+; SDAG-NEXT: v_mov_b32_e32 v28, s10
+; SDAG-NEXT: v_mov_b32_e32 v29, s11
+; SDAG-NEXT: v_mov_b32_e32 v18, s12
+; SDAG-NEXT: v_mov_b32_e32 v19, s13
+; SDAG-NEXT: v_mov_b32_e32 v20, s14
+; SDAG-NEXT: v_mov_b32_e32 v21, s15
+; SDAG-NEXT: v_mov_b32_e32 v22, s0
+; SDAG-NEXT: v_mov_b32_e32 v23, s1
+; SDAG-NEXT: v_mov_b32_e32 v24, s2
+; SDAG-NEXT: v_mov_b32_e32 v25, s3
+; SDAG-NEXT: v_mov_b32_e32 v16, s16
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
+; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[26:29], v[18:25], v16 cbsz:1 abid:2
; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 2
diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
index 9c38d7f5e2a05..5b0d2d22c3e3d 100644
--- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
+++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
@@ -3565,13 +3565,13 @@ define amdgpu_kernel void @fneg_v2f32_scalar(ptr addrspace(1) %a, <2 x float> %x
; PACKED-SDAG-LABEL: fneg_v2f32_scalar:
; PACKED-SDAG: ; %bb.0:
; PACKED-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; PACKED-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; PACKED-SDAG-NEXT: v_mov_b32_e32 v0, 0
; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-SDAG-NEXT: s_xor_b32 s3, s3, 0x80000000
; PACKED-SDAG-NEXT: s_xor_b32 s2, s2, 0x80000000
-; PACKED-SDAG-NEXT: v_mov_b32_e32 v0, s2
-; PACKED-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; PACKED-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; PACKED-SDAG-NEXT: v_mov_b32_e32 v2, s2
+; PACKED-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; PACKED-SDAG-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1]
; PACKED-SDAG-NEXT: s_endpgm
;
; PACKED-GISEL-LABEL: fneg_v2f32_scalar:
diff --git a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
index a5c8f0480c776..c3164b8dd07ef 100644
--- a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
@@ -71,26 +71,19 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
;
; PEI-GFX90A-LABEL: name: partial_copy
; PEI-GFX90A: bb.0 (%ir-block.0):
- ; PEI-GFX90A-NEXT: liveins: $agpr4, $sgpr4_sgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr9
+ ; PEI-GFX90A-NEXT: liveins: $sgpr4_sgpr5
; PEI-GFX90A-NEXT: {{ $}}
- ; PEI-GFX90A-NEXT: $sgpr12_sgpr13_sgpr14_sgpr15 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3
- ; PEI-GFX90A-NEXT: $sgpr12 = S_ADD_U32 $sgpr12, $sgpr9, implicit-def $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15
- ; PEI-GFX90A-NEXT: $sgpr13 = S_ADDC_U32 $sgpr13, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15
; PEI-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef renamable $agpr0
; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6422538 /* regdef:VReg_128_Align2 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3
; PEI-GFX90A-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec
- ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def renamable $vgpr0_vgpr1
- ; PEI-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5)
- ; PEI-GFX90A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1
+ ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def renamable $vgpr2_vgpr3
; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $agpr0_agpr1_agpr2_agpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1)
; PEI-GFX90A-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
; PEI-GFX90A-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec
; PEI-GFX90A-NEXT: renamable $vgpr0 = AV_MOV_B32_IMM_PSEUDO 1, implicit $exec
; PEI-GFX90A-NEXT: renamable $vgpr1 = AV_MOV_B32_IMM_PSEUDO 2, implicit $exec
; PEI-GFX90A-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_I32_4X4X4I8_e64 killed $vgpr0, killed $vgpr1, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
- ; PEI-GFX90A-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5)
- ; PEI-GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $vgpr0_vgpr1
- ; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX2 undef renamable $vgpr0_vgpr1, killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) poison`, addrspace 1)
+ ; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX2 undef renamable $vgpr0_vgpr1, killed renamable $vgpr2_vgpr3, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) poison`, addrspace 1)
; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $agpr0_agpr1_agpr2_agpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1)
; PEI-GFX90A-NEXT: S_ENDPGM 0
call void asm sideeffect "; use $0", "a" (i32 poison)
diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
index d48bfe0bb7f21..68ef30a90646e 100644
--- a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
+++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
@@ -53,31 +53,31 @@ define amdgpu_kernel void @store_v16i32(ptr addrspace(1) %out, <16 x i32> %a) {
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v0, s20
-; GFX942-NEXT: v_mov_b32_e32 v1, s21
-; GFX942-NEXT: v_mov_b32_e32 v2, s22
-; GFX942-NEXT: v_mov_b32_e32 v3, s23
-; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
+; GFX942-NEXT: v_mov_b32_e32 v2, s20
+; GFX942-NEXT: v_mov_b32_e32 v3, s21
+; GFX942-NEXT: v_mov_b32_e32 v4, s22
+; GFX942-NEXT: v_mov_b32_e32 v5, s23
+; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:48
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mov_b32_e32 v0, s16
-; GFX942-NEXT: v_mov_b32_e32 v1, s17
-; GFX942-NEXT: v_mov_b32_e32 v2, s18
-; GFX942-NEXT: v_mov_b32_e32 v3, s19
-; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
+; GFX942-NEXT: v_mov_b32_e32 v2, s16
+; GFX942-NEXT: v_mov_b32_e32 v3, s17
+; GFX942-NEXT: v_mov_b32_e32 v4, s18
+; GFX942-NEXT: v_mov_b32_e32 v5, s19
+; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:32
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mov_b32_e32 v0, s12
-; GFX942-NEXT: v_mov_b32_e32 v1, s13
-; GFX942-NEXT: v_mov_b32_e32 v2, s14
-; GFX942-NEXT: v_mov_b32_e32 v3, s15
-; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
+; GFX942-NEXT: v_mov_b32_e32 v2, s12
+; GFX942-NEXT: v_mov_b32_e32 v3, s13
+; GFX942-NEXT: v_mov_b32_e32 v4, s14
+; GFX942-NEXT: v_mov_b32_e32 v5, s15
+; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:16
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mov_b32_e32 v0, s8
-; GFX942-NEXT: v_mov_b32_e32 v1, s9
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: v_mov_b32_e32 v3, s11
-; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v2, s8
+; GFX942-NEXT: v_mov_b32_e32 v3, s9
+; GFX942-NEXT: v_mov_b32_e32 v4, s10
+; GFX942-NEXT: v_mov_b32_e32 v5, s11
+; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
; GFX942-NEXT: s_endpgm
entry:
store <16 x i32> %a, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/regalloc-illegal-eviction-assert.ll b/llvm/test/CodeGen/AMDGPU/regalloc-illegal-eviction-assert.ll
index f2fd3a8fedaaa..c035e9f9415c1 100644
--- a/llvm/test/CodeGen/AMDGPU/regalloc-illegal-eviction-assert.ll
+++ b/llvm/test/CodeGen/AMDGPU/regalloc-illegal-eviction-assert.ll
@@ -9,9 +9,9 @@
%asm.output = type { <16 x i32>, <8 x i32>, <5 x i32>, <4 x i32>, <16 x i32> }
; CHECK-LABEL: {{^}}illegal_eviction_assert:
-; CHECK: ; def v[4:19] v[20:27] v[0:4] v[0:3] a[0:15]
+; CHECK: ; def v[13:28] v[0:7] v[8:12] v[0:3] a[0:15]
; CHECK: ; clobber
-; CHECK: ; use v[4:19] v[20:27] v[0:4] v[0:3] a[1:16]
+; CHECK: ; use v[13:28] v[0:7] v[8:12] v[0:3] a[1:16]
define void @illegal_eviction_assert(ptr addrspace(1) %arg) #0 {
;%agpr0 = call i32 asm sideeffect "; def $0","=${a0}"()
%asm = call %asm.output asm sideeffect "; def $0 $1 $2 $3 $4","=v,=v,=v,=v,={a[0:15]}"()
diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll
index e8e122e26e192..bbb9df96f752e 100644
--- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll
@@ -59,19 +59,19 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 {
; GFX90A-LABEL: scalar_to_vector_v8i16:
; GFX90A: ; %bb.0: ; %entry
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX90A-NEXT: v_and_b32_e32 v4, 0x3ff, v0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 4, v4
+; GFX90A-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GFX90A-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX90A-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, s3
-; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v0, s0
-; GFX90A-NEXT: v_mov_b32_e32 v1, s1
+; GFX90A-NEXT: v_mov_b32_e32 v1, s3
+; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
; GFX90A-NEXT: v_mov_b32_e32 v2, s0
-; GFX90A-NEXT: v_mov_b32_e32 v3, s0
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
-; GFX90A-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX90A-NEXT: v_mov_b32_e32 v3, s1
+; GFX90A-NEXT: v_mov_b32_e32 v4, s0
+; GFX90A-NEXT: v_mov_b32_e32 v5, s0
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX90A-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; GFX90A-NEXT: s_endpgm
entry:
%val.1.i32 = extractelement <2 x i32> %in, i64 0
@@ -146,19 +146,19 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0
; GFX90A-LABEL: scalar_to_vector_v8f16:
; GFX90A: ; %bb.0: ; %entry
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX90A-NEXT: v_and_b32_e32 v4, 0x3ff, v0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 4, v4
+; GFX90A-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GFX90A-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX90A-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, s3
-; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v0, s0
-; GFX90A-NEXT: v_mov_b32_e32 v1, s1
-; GFX90A-NEXT: v_mov_b32_e32 v3, s0
+; GFX90A-NEXT: v_mov_b32_e32 v1, s3
+; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
; GFX90A-NEXT: v_mov_b32_e32 v2, s0
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
-; GFX90A-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX90A-NEXT: v_mov_b32_e32 v3, s1
+; GFX90A-NEXT: v_mov_b32_e32 v5, s0
+; GFX90A-NEXT: v_mov_b32_e32 v4, s0
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX90A-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; GFX90A-NEXT: s_endpgm
entry:
%val.1.float = extractelement <2 x float> %in, i64 0
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector-physreg-copy.ll b/llvm/test/CodeGen/AMDGPU/shufflevector-physreg-copy.ll
index 936118750ff10..4d864ad15b411 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector-physreg-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector-physreg-copy.ll
@@ -25,27 +25,27 @@ define void @shufflevector_v2i32_10_physreg_even_vgpr_pair_copy(ptr addrspace(1)
; GFX90A-LABEL: shufflevector_v2i32_10_physreg_even_vgpr_pair_copy:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v4, v5
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v5
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: shufflevector_v2i32_10_physreg_even_vgpr_pair_copy:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NEXT: v_mov_b32_e32 v0, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v4, v5
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_mov_b32_e32 v0, v5
-; GFX940-NEXT: v_mov_b32_e32 v1, v4
-; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX940-NEXT: v_mov_b32_e32 v2, v5
+; GFX940-NEXT: v_mov_b32_e32 v3, v4
+; GFX940-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1]
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%asm = call { i32, i32 } asm "; def $0, $1", "={v4},={v5}"()
@@ -214,27 +214,27 @@ define void @shufflevector_v2i32_11_physreg_even_vgpr_pair_copy(ptr addrspace(1)
; GFX90A-LABEL: shufflevector_v2i32_11_physreg_even_vgpr_pair_copy:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v4, v5
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v5
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v5
+; GFX90A-NEXT: v_mov_b32_e32 v3, v5
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: shufflevector_v2i32_11_physreg_even_vgpr_pair_copy:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NEXT: v_mov_b32_e32 v0, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v4, v5
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_mov_b32_e32 v0, v5
-; GFX940-NEXT: v_mov_b32_e32 v1, v5
-; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX940-NEXT: v_mov_b32_e32 v2, v5
+; GFX940-NEXT: v_mov_b32_e32 v3, v5
+; GFX940-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1]
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%asm = call { i32, i32 } asm "; def $0, $1", "={v4},={v5}"()
@@ -265,31 +265,31 @@ define void @shufflevector_v4i32_3210_physreg_even_vgpr_quad_copy(ptr addrspace(
; GFX90A-LABEL: shufflevector_v4i32_3210_physreg_even_vgpr_quad_copy:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v4, v5, v6, v7
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v2, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
-; GFX90A-NEXT: v_mov_b32_e32 v0, v7
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v10, v5
+; GFX90A-NEXT: v_mov_b32_e32 v9, v6
+; GFX90A-NEXT: v_mov_b32_e32 v8, v7
+; GFX90A-NEXT: v_mov_b32_e32 v11, v4
+; GFX90A-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: shufflevector_v4i32_3210_physreg_even_vgpr_quad_copy:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v8, 0
+; GFX940-NEXT: v_mov_b32_e32 v0, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v4, v5, v6, v7
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v5
-; GFX940-NEXT: v_mov_b32_e32 v1, v6
-; GFX940-NEXT: v_mov_b32_e32 v0, v7
-; GFX940-NEXT: v_mov_b32_e32 v3, v4
-; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX940-NEXT: v_mov_b32_e32 v10, v5
+; GFX940-NEXT: v_mov_b32_e32 v9, v6
+; GFX940-NEXT: v_mov_b32_e32 v8, v7
+; GFX940-NEXT: v_mov_b32_e32 v11, v4
+; GFX940-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%asm = call { i32, i32, i32, i32 } asm "; def $0, $1, $2, $3", "={v4},={v5},={v6},={v7}"()
@@ -327,31 +327,31 @@ define void @shufflevector_v4i32_1032_physreg_even_vgpr_quad_copy(ptr addrspace(
; GFX90A-LABEL: shufflevector_v4i32_1032_physreg_even_vgpr_quad_copy:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v4, v5, v6, v7
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v5
-; GFX90A-NEXT: v_mov_b32_e32 v3, v6
-; GFX90A-NEXT: v_mov_b32_e32 v2, v7
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
-; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, v5
+; GFX90A-NEXT: v_mov_b32_e32 v11, v6
+; GFX90A-NEXT: v_mov_b32_e32 v10, v7
+; GFX90A-NEXT: v_mov_b32_e32 v9, v4
+; GFX90A-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: shufflevector_v4i32_1032_physreg_even_vgpr_quad_copy:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v8, 0
+; GFX940-NEXT: v_mov_b32_e32 v0, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v4, v5, v6, v7
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_mov_b32_e32 v0, v5
-; GFX940-NEXT: v_mov_b32_e32 v3, v6
-; GFX940-NEXT: v_mov_b32_e32 v2, v7
-; GFX940-NEXT: v_mov_b32_e32 v1, v4
-; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX940-NEXT: v_mov_b32_e32 v8, v5
+; GFX940-NEXT: v_mov_b32_e32 v11, v6
+; GFX940-NEXT: v_mov_b32_e32 v10, v7
+; GFX940-NEXT: v_mov_b32_e32 v9, v4
+; GFX940-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%asm = call { i32, i32, i32, i32 } asm "; def $0, $1, $2, $3", "={v4},={v5},={v6},={v7}"()
@@ -746,16 +746,15 @@ define i32 @shufflevector_v4i32_3210_physreg_even_vgpr_quad_copy_other_use_elt(p
; GFX90A-LABEL: shufflevector_v4i32_3210_physreg_even_vgpr_quad_copy_other_use_elt:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v4, v5, v6, v7
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v2, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
-; GFX90A-NEXT: v_mov_b32_e32 v0, v7
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
-; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: v_mov_b32_e32 v10, v5
+; GFX90A-NEXT: v_mov_b32_e32 v9, v6
+; GFX90A-NEXT: v_mov_b32_e32 v8, v7
+; GFX90A-NEXT: v_mov_b32_e32 v11, v4
+; GFX90A-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17]
; GFX90A-NEXT: v_mov_b32_e32 v0, v6
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -763,17 +762,16 @@ define i32 @shufflevector_v4i32_3210_physreg_even_vgpr_quad_copy_other_use_elt(p
; GFX940-LABEL: shufflevector_v4i32_3210_physreg_even_vgpr_quad_copy_other_use_elt:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v8, 0
+; GFX940-NEXT: v_mov_b32_e32 v0, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v4, v5, v6, v7
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v5
-; GFX940-NEXT: v_mov_b32_e32 v1, v6
-; GFX940-NEXT: v_mov_b32_e32 v0, v7
-; GFX940-NEXT: v_mov_b32_e32 v3, v4
-; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
-; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_mov_b32_e32 v10, v5
+; GFX940-NEXT: v_mov_b32_e32 v9, v6
+; GFX940-NEXT: v_mov_b32_e32 v8, v7
+; GFX940-NEXT: v_mov_b32_e32 v11, v4
+; GFX940-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
; GFX940-NEXT: v_mov_b32_e32 v0, v6
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
index d23e314b9465f..f6c357dc38b48 100644
--- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
+++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
@@ -70,12 +70,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: s_xor_b64 s[4:5], s[4:5], -1
; GLOBALNESS1-NEXT: s_mov_b64 s[38:39], s[8:9]
; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[8:9], 1, v1
-; GLOBALNESS1-NEXT: ; implicit-def: $vgpr59 : SGPR spill to VGPR lane
+; GLOBALNESS1-NEXT: ; implicit-def: $vgpr57 : SGPR spill to VGPR lane
; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[66:67], 1, v0
; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; GLOBALNESS1-NEXT: v_writelane_b32 v59, s8, 0
+; GLOBALNESS1-NEXT: v_writelane_b32 v57, s8, 0
; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[68:69], 1, v0
-; GLOBALNESS1-NEXT: v_writelane_b32 v59, s9, 1
+; GLOBALNESS1-NEXT: v_writelane_b32 v57, s9, 1
; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[70:71], 1, v3
; GLOBALNESS1-NEXT: v_mov_b32_e32 v46, 0x80
; GLOBALNESS1-NEXT: s_mov_b32 s82, s16
@@ -84,7 +84,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: s_mov_b64 s[34:35], s[10:11]
; GLOBALNESS1-NEXT: v_mov_b32_e32 v47, 0
; GLOBALNESS1-NEXT: s_mov_b32 s32, 0
-; GLOBALNESS1-NEXT: ; implicit-def: $vgpr56_vgpr57
+; GLOBALNESS1-NEXT: ; implicit-def: $vgpr58_vgpr59
; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0)
; GLOBALNESS1-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2
; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -93,24 +93,24 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v0
; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GLOBALNESS1-NEXT: v_writelane_b32 v59, s4, 2
+; GLOBALNESS1-NEXT: v_writelane_b32 v57, s4, 2
; GLOBALNESS1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
-; GLOBALNESS1-NEXT: v_writelane_b32 v59, s5, 3
+; GLOBALNESS1-NEXT: v_writelane_b32 v57, s5, 3
; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v3
; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GLOBALNESS1-NEXT: v_writelane_b32 v59, s4, 4
-; GLOBALNESS1-NEXT: v_writelane_b32 v59, s5, 5
+; GLOBALNESS1-NEXT: v_writelane_b32 v57, s4, 4
+; GLOBALNESS1-NEXT: v_writelane_b32 v57, s5, 5
; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v2
-; GLOBALNESS1-NEXT: v_writelane_b32 v59, s4, 6
-; GLOBALNESS1-NEXT: v_writelane_b32 v59, s5, 7
+; GLOBALNESS1-NEXT: v_writelane_b32 v57, s4, 6
+; GLOBALNESS1-NEXT: v_writelane_b32 v57, s5, 7
; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[80:81], 1, v1
-; GLOBALNESS1-NEXT: v_writelane_b32 v59, s70, 8
-; GLOBALNESS1-NEXT: v_writelane_b32 v59, s71, 9
+; GLOBALNESS1-NEXT: v_writelane_b32 v57, s70, 8
+; GLOBALNESS1-NEXT: v_writelane_b32 v57, s71, 9
; GLOBALNESS1-NEXT: s_branch .LBB1_4
; GLOBALNESS1-NEXT: .LBB1_1: ; %bb70.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT: v_readlane_b32 s6, v59, 6
-; GLOBALNESS1-NEXT: v_readlane_b32 s7, v59, 7
+; GLOBALNESS1-NEXT: v_readlane_b32 s6, v57, 6
+; GLOBALNESS1-NEXT: v_readlane_b32 s7, v57, 7
; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[6:7]
; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_28
; GLOBALNESS1-NEXT: .LBB1_2: ; %Flow15
@@ -120,7 +120,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: .LBB1_3: ; %Flow28
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[6:7]
-; GLOBALNESS1-NEXT: v_pk_mov_b32 v[56:57], v[0:1], v[0:1] op_sel:[0,1]
+; GLOBALNESS1-NEXT: v_pk_mov_b32 v[58:59], v[0:1], v[0:1] op_sel:[0,1]
; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_29
; GLOBALNESS1-NEXT: .LBB1_4: ; %bb5
; GLOBALNESS1-NEXT: ; =>This Loop Header: Depth=1
@@ -128,7 +128,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: flat_load_dword v40, v[46:47]
; GLOBALNESS1-NEXT: s_add_u32 s8, s38, 40
; GLOBALNESS1-NEXT: buffer_store_dword v42, off, s[0:3], 0
-; GLOBALNESS1-NEXT: flat_load_dword v58, v[46:47]
+; GLOBALNESS1-NEXT: flat_load_dword v56, v[46:47]
; GLOBALNESS1-NEXT: s_addc_u32 s9, s39, 0
; GLOBALNESS1-NEXT: s_getpc_b64 s[4:5]
; GLOBALNESS1-NEXT: s_add_u32 s4, s4, wobble at gotpcrel32@lo+4
@@ -186,10 +186,10 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: ; %bb.11: ; %bb33.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS1-NEXT: global_load_dwordx2 v[0:1], v[44:45], off
-; GLOBALNESS1-NEXT: v_writelane_b32 v59, s8, 10
-; GLOBALNESS1-NEXT: v_writelane_b32 v59, s9, 11
-; GLOBALNESS1-NEXT: v_readlane_b32 s4, v59, 2
-; GLOBALNESS1-NEXT: v_readlane_b32 s5, v59, 3
+; GLOBALNESS1-NEXT: v_writelane_b32 v57, s8, 10
+; GLOBALNESS1-NEXT: v_writelane_b32 v57, s9, 11
+; GLOBALNESS1-NEXT: v_readlane_b32 s4, v57, 2
+; GLOBALNESS1-NEXT: v_readlane_b32 s5, v57, 3
; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[4:5]
; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_13
; GLOBALNESS1-NEXT: ; %bb.12: ; %bb39.i
@@ -198,7 +198,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[42:43], off
; GLOBALNESS1-NEXT: .LBB1_13: ; %bb44.lr.ph.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT: v_cmp_ne_u32_e32 vcc, 0, v58
+; GLOBALNESS1-NEXT: v_cmp_ne_u32_e32 vcc, 0, v56
; GLOBALNESS1-NEXT: v_cndmask_b32_e32 v2, 0, v40, vcc
; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0)
; GLOBALNESS1-NEXT: v_cmp_nlt_f64_e32 vcc, 0, v[0:1]
@@ -228,8 +228,8 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_21
; GLOBALNESS1-NEXT: ; %bb.19: ; %bb3.i.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS1-NEXT: v_readlane_b32 s4, v59, 0
-; GLOBALNESS1-NEXT: v_readlane_b32 s5, v59, 1
+; GLOBALNESS1-NEXT: v_readlane_b32 s4, v57, 0
+; GLOBALNESS1-NEXT: v_readlane_b32 s5, v57, 1
; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[4:5]
; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_21
; GLOBALNESS1-NEXT: ; %bb.20: ; %bb6.i.i
@@ -265,7 +265,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: s_mov_b32 s13, s83
; GLOBALNESS1-NEXT: s_mov_b32 s14, s82
; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41
-; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[56:57], off
+; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[58:59], off
; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[54:55]
; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[96:97]
; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_14
@@ -277,13 +277,13 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: .LBB1_24: ; %Flow23
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS1-NEXT: s_load_dwordx4 s[4:7], s[38:39], 0x0
-; GLOBALNESS1-NEXT: v_readlane_b32 s70, v59, 8
-; GLOBALNESS1-NEXT: v_readlane_b32 s8, v59, 10
+; GLOBALNESS1-NEXT: v_readlane_b32 s70, v57, 8
+; GLOBALNESS1-NEXT: v_readlane_b32 s8, v57, 10
; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], 0, 0
-; GLOBALNESS1-NEXT: v_readlane_b32 s71, v59, 9
+; GLOBALNESS1-NEXT: v_readlane_b32 s71, v57, 9
; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0)
; GLOBALNESS1-NEXT: s_mov_b32 s55, s7
-; GLOBALNESS1-NEXT: v_readlane_b32 s9, v59, 11
+; GLOBALNESS1-NEXT: v_readlane_b32 s9, v57, 11
; GLOBALNESS1-NEXT: .LBB1_25: ; %Flow24
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS1-NEXT: s_or_b64 exec, exec, s[52:53]
@@ -291,8 +291,8 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_2
; GLOBALNESS1-NEXT: ; %bb.26: ; %bb67.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT: v_readlane_b32 s6, v59, 4
-; GLOBALNESS1-NEXT: v_readlane_b32 s7, v59, 5
+; GLOBALNESS1-NEXT: v_readlane_b32 s6, v57, 4
+; GLOBALNESS1-NEXT: v_readlane_b32 s7, v57, 5
; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[6:7]
; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_1
; GLOBALNESS1-NEXT: ; %bb.27: ; %bb69.i
@@ -384,12 +384,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: s_xor_b64 s[4:5], s[4:5], -1
; GLOBALNESS0-NEXT: s_mov_b64 s[38:39], s[8:9]
; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[8:9], 1, v1
-; GLOBALNESS0-NEXT: ; implicit-def: $vgpr59 : SGPR spill to VGPR lane
+; GLOBALNESS0-NEXT: ; implicit-def: $vgpr57 : SGPR spill to VGPR lane
; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[66:67], 1, v0
; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; GLOBALNESS0-NEXT: v_writelane_b32 v59, s8, 0
+; GLOBALNESS0-NEXT: v_writelane_b32 v57, s8, 0
; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[68:69], 1, v0
-; GLOBALNESS0-NEXT: v_writelane_b32 v59, s9, 1
+; GLOBALNESS0-NEXT: v_writelane_b32 v57, s9, 1
; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[84:85], 1, v3
; GLOBALNESS0-NEXT: v_mov_b32_e32 v46, 0x80
; GLOBALNESS0-NEXT: s_mov_b32 s70, s16
@@ -398,7 +398,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: s_mov_b64 s[34:35], s[10:11]
; GLOBALNESS0-NEXT: v_mov_b32_e32 v47, 0
; GLOBALNESS0-NEXT: s_mov_b32 s32, 0
-; GLOBALNESS0-NEXT: ; implicit-def: $vgpr56_vgpr57
+; GLOBALNESS0-NEXT: ; implicit-def: $vgpr58_vgpr59
; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0)
; GLOBALNESS0-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2
; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -407,24 +407,24 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v0
; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GLOBALNESS0-NEXT: v_writelane_b32 v59, s4, 2
+; GLOBALNESS0-NEXT: v_writelane_b32 v57, s4, 2
; GLOBALNESS0-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
-; GLOBALNESS0-NEXT: v_writelane_b32 v59, s5, 3
+; GLOBALNESS0-NEXT: v_writelane_b32 v57, s5, 3
; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v3
; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GLOBALNESS0-NEXT: v_writelane_b32 v59, s4, 4
-; GLOBALNESS0-NEXT: v_writelane_b32 v59, s5, 5
+; GLOBALNESS0-NEXT: v_writelane_b32 v57, s4, 4
+; GLOBALNESS0-NEXT: v_writelane_b32 v57, s5, 5
; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v2
-; GLOBALNESS0-NEXT: v_writelane_b32 v59, s4, 6
-; GLOBALNESS0-NEXT: v_writelane_b32 v59, s5, 7
+; GLOBALNESS0-NEXT: v_writelane_b32 v57, s4, 6
+; GLOBALNESS0-NEXT: v_writelane_b32 v57, s5, 7
; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[80:81], 1, v1
-; GLOBALNESS0-NEXT: v_writelane_b32 v59, s84, 8
-; GLOBALNESS0-NEXT: v_writelane_b32 v59, s85, 9
+; GLOBALNESS0-NEXT: v_writelane_b32 v57, s84, 8
+; GLOBALNESS0-NEXT: v_writelane_b32 v57, s85, 9
; GLOBALNESS0-NEXT: s_branch .LBB1_4
; GLOBALNESS0-NEXT: .LBB1_1: ; %bb70.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT: v_readlane_b32 s6, v59, 6
-; GLOBALNESS0-NEXT: v_readlane_b32 s7, v59, 7
+; GLOBALNESS0-NEXT: v_readlane_b32 s6, v57, 6
+; GLOBALNESS0-NEXT: v_readlane_b32 s7, v57, 7
; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[6:7]
; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_28
; GLOBALNESS0-NEXT: .LBB1_2: ; %Flow15
@@ -434,7 +434,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: .LBB1_3: ; %Flow28
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[6:7]
-; GLOBALNESS0-NEXT: v_pk_mov_b32 v[56:57], v[0:1], v[0:1] op_sel:[0,1]
+; GLOBALNESS0-NEXT: v_pk_mov_b32 v[58:59], v[0:1], v[0:1] op_sel:[0,1]
; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_29
; GLOBALNESS0-NEXT: .LBB1_4: ; %bb5
; GLOBALNESS0-NEXT: ; =>This Loop Header: Depth=1
@@ -442,7 +442,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: flat_load_dword v40, v[46:47]
; GLOBALNESS0-NEXT: s_add_u32 s8, s38, 40
; GLOBALNESS0-NEXT: buffer_store_dword v42, off, s[0:3], 0
-; GLOBALNESS0-NEXT: flat_load_dword v58, v[46:47]
+; GLOBALNESS0-NEXT: flat_load_dword v56, v[46:47]
; GLOBALNESS0-NEXT: s_addc_u32 s9, s39, 0
; GLOBALNESS0-NEXT: s_getpc_b64 s[4:5]
; GLOBALNESS0-NEXT: s_add_u32 s4, s4, wobble at gotpcrel32@lo+4
@@ -500,10 +500,10 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: ; %bb.11: ; %bb33.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS0-NEXT: global_load_dwordx2 v[0:1], v[44:45], off
-; GLOBALNESS0-NEXT: v_writelane_b32 v59, s8, 10
-; GLOBALNESS0-NEXT: v_writelane_b32 v59, s9, 11
-; GLOBALNESS0-NEXT: v_readlane_b32 s4, v59, 2
-; GLOBALNESS0-NEXT: v_readlane_b32 s5, v59, 3
+; GLOBALNESS0-NEXT: v_writelane_b32 v57, s8, 10
+; GLOBALNESS0-NEXT: v_writelane_b32 v57, s9, 11
+; GLOBALNESS0-NEXT: v_readlane_b32 s4, v57, 2
+; GLOBALNESS0-NEXT: v_readlane_b32 s5, v57, 3
; GLOBALNESS0-NEXT: s_mov_b32 s83, s55
; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[4:5]
; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_13
@@ -513,7 +513,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[42:43], off
; GLOBALNESS0-NEXT: .LBB1_13: ; %bb44.lr.ph.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT: v_cmp_ne_u32_e32 vcc, 0, v58
+; GLOBALNESS0-NEXT: v_cmp_ne_u32_e32 vcc, 0, v56
; GLOBALNESS0-NEXT: v_cndmask_b32_e32 v2, 0, v40, vcc
; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0)
; GLOBALNESS0-NEXT: v_cmp_nlt_f64_e32 vcc, 0, v[0:1]
@@ -543,8 +543,8 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_21
; GLOBALNESS0-NEXT: ; %bb.19: ; %bb3.i.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS0-NEXT: v_readlane_b32 s4, v59, 0
-; GLOBALNESS0-NEXT: v_readlane_b32 s5, v59, 1
+; GLOBALNESS0-NEXT: v_readlane_b32 s4, v57, 0
+; GLOBALNESS0-NEXT: v_readlane_b32 s5, v57, 1
; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[4:5]
; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_21
; GLOBALNESS0-NEXT: ; %bb.20: ; %bb6.i.i
@@ -580,7 +580,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: s_mov_b32 s13, s71
; GLOBALNESS0-NEXT: s_mov_b32 s14, s70
; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41
-; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[56:57], off
+; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[58:59], off
; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[54:55]
; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[96:97]
; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_14
@@ -591,12 +591,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: s_branch .LBB1_14
; GLOBALNESS0-NEXT: .LBB1_24: ; %Flow23
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT: v_readlane_b32 s84, v59, 8
-; GLOBALNESS0-NEXT: v_readlane_b32 s8, v59, 10
+; GLOBALNESS0-NEXT: v_readlane_b32 s84, v57, 8
+; GLOBALNESS0-NEXT: v_readlane_b32 s8, v57, 10
; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], 0, 0
; GLOBALNESS0-NEXT: s_mov_b32 s55, s83
-; GLOBALNESS0-NEXT: v_readlane_b32 s85, v59, 9
-; GLOBALNESS0-NEXT: v_readlane_b32 s9, v59, 11
+; GLOBALNESS0-NEXT: v_readlane_b32 s85, v57, 9
+; GLOBALNESS0-NEXT: v_readlane_b32 s9, v57, 11
; GLOBALNESS0-NEXT: .LBB1_25: ; %Flow24
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS0-NEXT: s_or_b64 exec, exec, s[52:53]
@@ -604,8 +604,8 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_2
; GLOBALNESS0-NEXT: ; %bb.26: ; %bb67.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT: v_readlane_b32 s6, v59, 4
-; GLOBALNESS0-NEXT: v_readlane_b32 s7, v59, 5
+; GLOBALNESS0-NEXT: v_readlane_b32 s6, v57, 4
+; GLOBALNESS0-NEXT: v_readlane_b32 s7, v57, 5
; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[6:7]
; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_1
; GLOBALNESS0-NEXT: ; %bb.27: ; %bb69.i
diff --git a/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll b/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll
index d0d1ba82dc000..b3166fa3f4548 100644
--- a/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll
+++ b/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll
@@ -8,9 +8,8 @@ define amdgpu_kernel void @foo(ptr addrspace(5) %ptr5, ptr %p0, double %v0, <4 x
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
-; CHECK-NEXT: v_mov_b32_e32 v40, v0
-; CHECK-NEXT: v_pk_mov_b32 v[0:1], 0, 0
-; CHECK-NEXT: flat_load_dword v42, v[0:1]
+; CHECK-NEXT: v_pk_mov_b32 v[44:45], 0, 0
+; CHECK-NEXT: flat_load_dword v42, v[44:45]
; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7]
; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5]
; CHECK-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x8
@@ -19,48 +18,44 @@ define amdgpu_kernel void @foo(ptr addrspace(5) %ptr5, ptr %p0, double %v0, <4 x
; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: s_mov_b64 s[34:35], s[8:9]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v46, s6
-; CHECK-NEXT: v_mov_b32_e32 v47, s7
+; CHECK-NEXT: v_accvgpr_write_b32 a32, s6
+; CHECK-NEXT: v_accvgpr_write_b32 a33, s7
; CHECK-NEXT: s_mov_b64 s[6:7], src_private_base
; CHECK-NEXT: s_cmp_lg_u32 s64, -1
; CHECK-NEXT: s_cselect_b32 s7, s7, 0
; CHECK-NEXT: s_cselect_b32 s8, s64, 0
; CHECK-NEXT: s_add_u32 s50, s34, 48
; CHECK-NEXT: s_addc_u32 s51, s35, 0
-; CHECK-NEXT: v_pk_mov_b32 v[58:59], s[4:5], s[4:5] op_sel:[0,1]
+; CHECK-NEXT: v_pk_mov_b32 v[56:57], s[4:5], s[4:5] op_sel:[0,1]
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, G at gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, G at gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[54:55], s[4:5], 0x0
; CHECK-NEXT: s_mov_b32 s6, 0
-; CHECK-NEXT: v_pk_mov_b32 v[0:1], 0, 0
-; CHECK-NEXT: v_mov_b32_e32 v57, s7
+; CHECK-NEXT: v_mov_b32_e32 v47, s7
; CHECK-NEXT: s_mov_b32 s7, s6
; CHECK-NEXT: s_mov_b32 s53, s14
-; CHECK-NEXT: v_accvgpr_write_b32 a33, v1
-; CHECK-NEXT: v_mov_b32_e32 v56, s8
-; CHECK-NEXT: v_pk_mov_b32 v[60:61], s[6:7], s[6:7] op_sel:[0,1]
+; CHECK-NEXT: v_mov_b32_e32 v46, s8
+; CHECK-NEXT: v_pk_mov_b32 v[58:59], s[6:7], s[6:7] op_sel:[0,1]
; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49]
; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
; CHECK-NEXT: s_mov_b64 s[8:9], s[50:51]
; CHECK-NEXT: s_mov_b32 s12, s14
; CHECK-NEXT: s_mov_b32 s13, s15
; CHECK-NEXT: s_mov_b32 s14, s16
-; CHECK-NEXT: v_mov_b32_e32 v31, v40
+; CHECK-NEXT: v_mov_b32_e32 v31, v0
; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_mov_b32 s33, s16
; CHECK-NEXT: s_mov_b32 s52, s15
; CHECK-NEXT: s_mov_b64 s[36:37], s[10:11]
-; CHECK-NEXT: v_accvgpr_write_b32 a32, v0
-; CHECK-NEXT: flat_store_dwordx2 v[58:59], v[60:61]
+; CHECK-NEXT: v_mov_b32_e32 v40, v0
+; CHECK-NEXT: flat_store_dwordx2 v[56:57], v[58:59]
; CHECK-NEXT: ; kill: def $sgpr15 killed $sgpr15
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[54:55]
-; CHECK-NEXT: flat_load_dwordx2 v[62:63], v[58:59]
-; CHECK-NEXT: v_accvgpr_read_b32 v0, a32
-; CHECK-NEXT: v_mov_b32_e32 v44, 0
-; CHECK-NEXT: v_mov_b32_e32 v45, 0x3ff00000
-; CHECK-NEXT: v_accvgpr_read_b32 v1, a33
+; CHECK-NEXT: flat_load_dwordx2 v[60:61], v[56:57]
+; CHECK-NEXT: v_mov_b32_e32 v62, 0
+; CHECK-NEXT: v_mov_b32_e32 v63, 0x3ff00000
; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49]
; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
; CHECK-NEXT: s_mov_b64 s[8:9], s[50:51]
@@ -69,20 +64,20 @@ define amdgpu_kernel void @foo(ptr addrspace(5) %ptr5, ptr %p0, double %v0, <4 x
; CHECK-NEXT: s_mov_b32 s13, s52
; CHECK-NEXT: s_mov_b32 s14, s33
; CHECK-NEXT: v_mov_b32_e32 v31, v40
-; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[44:45]
-; CHECK-NEXT: flat_store_dwordx2 v[58:59], v[60:61]
+; CHECK-NEXT: flat_store_dwordx2 v[44:45], v[62:63]
+; CHECK-NEXT: flat_store_dwordx2 v[56:57], v[58:59]
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: ; kill: def $sgpr15 killed $sgpr15
; CHECK-NEXT: s_swappc_b64 s[30:31], s[54:55]
-; CHECK-NEXT: flat_load_dwordx2 v[0:1], v[56:57] glc
+; CHECK-NEXT: flat_load_dwordx2 v[0:1], v[46:47] glc
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v0, s64
; CHECK-NEXT: v_cmp_lt_i32_e32 vcc, 0, v42
-; CHECK-NEXT: flat_store_dwordx2 v[58:59], v[62:63]
+; CHECK-NEXT: flat_store_dwordx2 v[56:57], v[60:61]
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx2 v[58:59], v[46:47]
-; CHECK-NEXT: buffer_store_dword v47, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT: buffer_store_dword v44, v0, s[0:3], 0 offen
+; CHECK-NEXT: flat_store_dwordx2 v[56:57], a[32:33]
+; CHECK-NEXT: buffer_store_dword a33, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v62, v0, s[0:3], 0 offen
; CHECK-NEXT: ; implicit-def: $vgpr4
; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc
; CHECK-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
index 2f25a93899721..fe7def8a69278 100644
--- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
@@ -1961,16 +1961,15 @@ define <6 x half> @shuffle_v6f16_452367(ptr addrspace(1) %arg0, ptr addrspace(1)
; GFX942-LABEL: shuffle_v6f16_452367:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: global_load_dwordx3 v[0:2], v[6:7], off
-; GFX942-NEXT: global_load_dword v3, v[4:5], off
+; GFX942-NEXT: global_load_dwordx3 v[4:6], v[0:1], off
+; GFX942-NEXT: global_load_dword v4, v[2:3], off
+; GFX942-NEXT: ; kill: killed $vgpr0 killed $vgpr1
+; GFX942-NEXT: ; kill: killed $vgpr2 killed $vgpr3
; GFX942-NEXT: s_waitcnt vmcnt(1)
-; GFX942-NEXT: v_mov_b32_e32 v0, v2
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v1, v5
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v6f16_452367:
@@ -5151,16 +5150,15 @@ define <6 x bfloat> @shuffle_v6bf16_452367(ptr addrspace(1) %arg0, ptr addrspace
; GFX942-LABEL: shuffle_v6bf16_452367:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: global_load_dwordx3 v[0:2], v[6:7], off
-; GFX942-NEXT: global_load_dword v3, v[4:5], off
+; GFX942-NEXT: global_load_dwordx3 v[4:6], v[0:1], off
+; GFX942-NEXT: global_load_dword v4, v[2:3], off
+; GFX942-NEXT: ; kill: killed $vgpr0 killed $vgpr1
+; GFX942-NEXT: ; kill: killed $vgpr2 killed $vgpr3
; GFX942-NEXT: s_waitcnt vmcnt(1)
-; GFX942-NEXT: v_mov_b32_e32 v0, v2
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v1, v5
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v6bf16_452367:
diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
index a401f989a2507..d8264b5a091e1 100644
--- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
+++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
@@ -58,19 +58,19 @@ define amdgpu_kernel void @v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX942-NEXT: v_and_b32_e32 v3, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v2, 2, v3
+; GFX942-NEXT: v_lshlrev_b32_e32 v1, 2, v3
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_load_dword v1, v2, s[0:1]
+; GFX942-NEXT: global_load_dword v2, v1, s[0:1]
; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v3
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX942-NEXT: s_cbranch_execz .LBB1_2
; GFX942-NEXT: ; %bb.1: ; %bb.1
-; GFX942-NEXT: global_load_dword v1, v2, s[2:3]
+; GFX942-NEXT: global_load_dword v2, v1, s[2:3]
; GFX942-NEXT: .LBB1_2: ; %bb.2
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX942-NEXT: global_store_dword v0, v2, s[6:7]
; GFX942-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -136,19 +136,19 @@ define amdgpu_kernel void @v8i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX942-NEXT: v_and_b32_e32 v4, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v4
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_lshlrev_b32_e32 v1, 3, v4
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[0:1]
+; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[0:1]
; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v4
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX942-NEXT: s_cbranch_execz .LBB3_2
; GFX942-NEXT: ; %bb.1: ; %bb.1
-; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[2:3]
+; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[2:3]
; GFX942-NEXT: .LBB3_2: ; %bb.2
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[6:7]
; GFX942-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -173,19 +173,19 @@ define amdgpu_kernel void @v16i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX942-NEXT: v_and_b32_e32 v6, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v5, 4, v6
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_lshlrev_b32_e32 v1, 4, v6
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx4 v[0:3], v5, s[0:1]
+; GFX942-NEXT: global_load_dwordx4 v[2:5], v1, s[0:1]
; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v6
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX942-NEXT: s_cbranch_execz .LBB4_2
; GFX942-NEXT: ; %bb.1: ; %bb.1
-; GFX942-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3]
+; GFX942-NEXT: global_load_dwordx4 v[2:5], v1, s[2:3]
; GFX942-NEXT: .LBB4_2: ; %bb.2
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[6:7]
; GFX942-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -210,23 +210,23 @@ define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX942-NEXT: v_and_b32_e32 v10, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v9, 5, v10
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_lshlrev_b32_e32 v1, 5, v10
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx4 v[4:7], v9, s[0:1] offset:16
-; GFX942-NEXT: global_load_dwordx4 v[0:3], v9, s[0:1]
+; GFX942-NEXT: global_load_dwordx4 v[6:9], v1, s[0:1] offset:16
+; GFX942-NEXT: global_load_dwordx4 v[2:5], v1, s[0:1]
; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v10
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX942-NEXT: s_cbranch_execz .LBB5_2
; GFX942-NEXT: ; %bb.1: ; %bb.1
-; GFX942-NEXT: global_load_dwordx4 v[4:7], v9, s[2:3] offset:16
-; GFX942-NEXT: global_load_dwordx4 v[0:3], v9, s[2:3]
+; GFX942-NEXT: global_load_dwordx4 v[6:9], v1, s[2:3] offset:16
+; GFX942-NEXT: global_load_dwordx4 v[2:5], v1, s[2:3]
; GFX942-NEXT: .LBB5_2: ; %bb.2
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(1)
-; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[6:7] offset:16
+; GFX942-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7] offset:16
; GFX942-NEXT: s_waitcnt vmcnt(1)
-; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[6:7]
+; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[6:7]
; GFX942-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -250,72 +250,72 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT: v_and_b32_e32 v2, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v1, 3, v2
+; GFX942-NEXT: v_and_b32_e32 v62, 0x3ff, v0
+; GFX942-NEXT: v_lshlrev_b32_e32 v1, 3, v62
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx4 v[28:31], v1, s[0:1] offset:240
-; GFX942-NEXT: global_load_dwordx4 v[24:27], v1, s[0:1] offset:224
-; GFX942-NEXT: global_load_dwordx4 v[20:23], v1, s[0:1] offset:208
-; GFX942-NEXT: global_load_dwordx4 v[16:19], v1, s[0:1] offset:192
-; GFX942-NEXT: global_load_dwordx4 v[12:15], v1, s[0:1] offset:176
-; GFX942-NEXT: global_load_dwordx4 v[8:11], v1, s[0:1] offset:160
-; GFX942-NEXT: global_load_dwordx4 v[4:7], v1, s[0:1] offset:144
-; GFX942-NEXT: global_load_dwordx4 a[0:3], v1, s[0:1] offset:128
-; GFX942-NEXT: global_load_dwordx4 v[60:63], v1, s[0:1] offset:112
-; GFX942-NEXT: global_load_dwordx4 v[56:59], v1, s[0:1] offset:96
-; GFX942-NEXT: global_load_dwordx4 v[52:55], v1, s[0:1] offset:80
-; GFX942-NEXT: global_load_dwordx4 v[48:51], v1, s[0:1] offset:64
-; GFX942-NEXT: global_load_dwordx4 v[44:47], v1, s[0:1] offset:48
-; GFX942-NEXT: global_load_dwordx4 v[40:43], v1, s[0:1] offset:32
-; GFX942-NEXT: global_load_dwordx4 v[36:39], v1, s[0:1] offset:16
-; GFX942-NEXT: global_load_dwordx4 v[32:35], v1, s[0:1]
-; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v2
+; GFX942-NEXT: global_load_dwordx4 v[30:33], v1, s[0:1] offset:240
+; GFX942-NEXT: global_load_dwordx4 v[26:29], v1, s[0:1] offset:224
+; GFX942-NEXT: global_load_dwordx4 v[22:25], v1, s[0:1] offset:208
+; GFX942-NEXT: global_load_dwordx4 v[18:21], v1, s[0:1] offset:192
+; GFX942-NEXT: global_load_dwordx4 v[14:17], v1, s[0:1] offset:176
+; GFX942-NEXT: global_load_dwordx4 v[10:13], v1, s[0:1] offset:160
+; GFX942-NEXT: global_load_dwordx4 v[6:9], v1, s[0:1] offset:144
+; GFX942-NEXT: global_load_dwordx4 v[2:5], v1, s[0:1] offset:128
+; GFX942-NEXT: global_load_dwordx4 a[0:3], v1, s[0:1] offset:112
+; GFX942-NEXT: global_load_dwordx4 v[58:61], v1, s[0:1] offset:96
+; GFX942-NEXT: global_load_dwordx4 v[54:57], v1, s[0:1] offset:80
+; GFX942-NEXT: global_load_dwordx4 v[50:53], v1, s[0:1] offset:64
+; GFX942-NEXT: global_load_dwordx4 v[46:49], v1, s[0:1] offset:48
+; GFX942-NEXT: global_load_dwordx4 v[42:45], v1, s[0:1] offset:32
+; GFX942-NEXT: global_load_dwordx4 v[38:41], v1, s[0:1] offset:16
+; GFX942-NEXT: global_load_dwordx4 v[34:37], v1, s[0:1]
+; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v62
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX942-NEXT: s_cbranch_execz .LBB6_2
; GFX942-NEXT: ; %bb.1: ; %bb.1
-; GFX942-NEXT: global_load_dwordx4 v[28:31], v1, s[2:3] offset:240
-; GFX942-NEXT: global_load_dwordx4 v[24:27], v1, s[2:3] offset:224
-; GFX942-NEXT: global_load_dwordx4 v[20:23], v1, s[2:3] offset:208
-; GFX942-NEXT: global_load_dwordx4 v[16:19], v1, s[2:3] offset:192
-; GFX942-NEXT: global_load_dwordx4 v[12:15], v1, s[2:3] offset:176
-; GFX942-NEXT: global_load_dwordx4 v[8:11], v1, s[2:3] offset:160
-; GFX942-NEXT: global_load_dwordx4 v[4:7], v1, s[2:3] offset:144
-; GFX942-NEXT: global_load_dwordx4 a[0:3], v1, s[2:3] offset:128
-; GFX942-NEXT: global_load_dwordx4 v[60:63], v1, s[2:3] offset:112
-; GFX942-NEXT: global_load_dwordx4 v[56:59], v1, s[2:3] offset:96
-; GFX942-NEXT: global_load_dwordx4 v[52:55], v1, s[2:3] offset:80
-; GFX942-NEXT: global_load_dwordx4 v[48:51], v1, s[2:3] offset:64
-; GFX942-NEXT: global_load_dwordx4 v[44:47], v1, s[2:3] offset:48
-; GFX942-NEXT: global_load_dwordx4 v[40:43], v1, s[2:3] offset:32
-; GFX942-NEXT: global_load_dwordx4 v[36:39], v1, s[2:3] offset:16
-; GFX942-NEXT: global_load_dwordx4 v[32:35], v1, s[2:3]
+; GFX942-NEXT: global_load_dwordx4 v[30:33], v1, s[2:3] offset:240
+; GFX942-NEXT: global_load_dwordx4 v[26:29], v1, s[2:3] offset:224
+; GFX942-NEXT: global_load_dwordx4 v[22:25], v1, s[2:3] offset:208
+; GFX942-NEXT: global_load_dwordx4 v[18:21], v1, s[2:3] offset:192
+; GFX942-NEXT: global_load_dwordx4 v[14:17], v1, s[2:3] offset:176
+; GFX942-NEXT: global_load_dwordx4 v[10:13], v1, s[2:3] offset:160
+; GFX942-NEXT: global_load_dwordx4 v[6:9], v1, s[2:3] offset:144
+; GFX942-NEXT: global_load_dwordx4 v[2:5], v1, s[2:3] offset:128
+; GFX942-NEXT: global_load_dwordx4 a[0:3], v1, s[2:3] offset:112
+; GFX942-NEXT: global_load_dwordx4 v[58:61], v1, s[2:3] offset:96
+; GFX942-NEXT: global_load_dwordx4 v[54:57], v1, s[2:3] offset:80
+; GFX942-NEXT: global_load_dwordx4 v[50:53], v1, s[2:3] offset:64
+; GFX942-NEXT: global_load_dwordx4 v[46:49], v1, s[2:3] offset:48
+; GFX942-NEXT: global_load_dwordx4 v[42:45], v1, s[2:3] offset:32
+; GFX942-NEXT: global_load_dwordx4 v[38:41], v1, s[2:3] offset:16
+; GFX942-NEXT: global_load_dwordx4 v[34:37], v1, s[2:3]
; GFX942-NEXT: .LBB6_2: ; %bb.2
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(7)
-; GFX942-NEXT: global_store_dwordx4 v0, v[60:63], s[6:7] offset:112
+; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] offset:112
; GFX942-NEXT: s_waitcnt vmcnt(7)
-; GFX942-NEXT: global_store_dwordx4 v0, v[56:59], s[6:7] offset:96
+; GFX942-NEXT: global_store_dwordx4 v0, v[58:61], s[6:7] offset:96
; GFX942-NEXT: s_waitcnt vmcnt(7)
-; GFX942-NEXT: global_store_dwordx4 v0, v[52:55], s[6:7] offset:80
+; GFX942-NEXT: global_store_dwordx4 v0, v[54:57], s[6:7] offset:80
; GFX942-NEXT: s_waitcnt vmcnt(7)
-; GFX942-NEXT: global_store_dwordx4 v0, v[48:51], s[6:7] offset:64
+; GFX942-NEXT: global_store_dwordx4 v0, v[50:53], s[6:7] offset:64
; GFX942-NEXT: s_waitcnt vmcnt(7)
-; GFX942-NEXT: global_store_dwordx4 v0, v[44:47], s[6:7] offset:48
+; GFX942-NEXT: global_store_dwordx4 v0, v[46:49], s[6:7] offset:48
; GFX942-NEXT: s_waitcnt vmcnt(7)
-; GFX942-NEXT: global_store_dwordx4 v0, v[40:43], s[6:7] offset:32
+; GFX942-NEXT: global_store_dwordx4 v0, v[42:45], s[6:7] offset:32
; GFX942-NEXT: s_waitcnt vmcnt(7)
-; GFX942-NEXT: global_store_dwordx4 v0, v[36:39], s[6:7] offset:16
+; GFX942-NEXT: global_store_dwordx4 v0, v[38:41], s[6:7] offset:16
; GFX942-NEXT: s_waitcnt vmcnt(7)
-; GFX942-NEXT: global_store_dwordx4 v0, v[32:35], s[6:7]
-; GFX942-NEXT: global_store_dwordx4 v0, v[28:31], s[6:7] offset:240
-; GFX942-NEXT: global_store_dwordx4 v0, v[24:27], s[6:7] offset:224
-; GFX942-NEXT: global_store_dwordx4 v0, v[20:23], s[6:7] offset:208
-; GFX942-NEXT: global_store_dwordx4 v0, v[16:19], s[6:7] offset:192
-; GFX942-NEXT: global_store_dwordx4 v0, v[12:15], s[6:7] offset:176
-; GFX942-NEXT: global_store_dwordx4 v0, v[8:11], s[6:7] offset:160
-; GFX942-NEXT: global_store_dwordx4 v0, v[4:7], s[6:7] offset:144
-; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] offset:128
+; GFX942-NEXT: global_store_dwordx4 v0, v[34:37], s[6:7]
+; GFX942-NEXT: global_store_dwordx4 v0, v[30:33], s[6:7] offset:240
+; GFX942-NEXT: global_store_dwordx4 v0, v[26:29], s[6:7] offset:224
+; GFX942-NEXT: global_store_dwordx4 v0, v[22:25], s[6:7] offset:208
+; GFX942-NEXT: global_store_dwordx4 v0, v[18:21], s[6:7] offset:192
+; GFX942-NEXT: global_store_dwordx4 v0, v[14:17], s[6:7] offset:176
+; GFX942-NEXT: global_store_dwordx4 v0, v[10:13], s[6:7] offset:160
+; GFX942-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7] offset:144
+; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[6:7] offset:128
; GFX942-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -391,17 +391,17 @@ define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace(
; GFX942-LABEL: v8i8_phi_chain:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX942-NEXT: v_and_b32_e32 v2, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v2
-; GFX942-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v2
-; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v2
+; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX942-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v0
+; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[8:9]
+; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[8:9]
; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX942-NEXT: s_cbranch_execz .LBB8_2
; GFX942-NEXT: ; %bb.1: ; %bb.1
-; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[10:11]
-; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v2
+; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[10:11]
+; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0
; GFX942-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
; GFX942-NEXT: s_and_b64 s[4:5], vcc, exec
; GFX942-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
@@ -410,14 +410,14 @@ define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace(
; GFX942-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
; GFX942-NEXT: s_cbranch_execz .LBB8_4
; GFX942-NEXT: ; %bb.3: ; %bb.2
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[12:13]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[12:13]
; GFX942-NEXT: .LBB8_4: ; %bb.3
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[14:15]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[14:15]
; GFX942-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -447,38 +447,38 @@ define amdgpu_kernel void @v8i8_phi_zeroinit(ptr addrspace(1) %src1, ptr addrspa
; GFX942-LABEL: v8i8_phi_zeroinit:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX942-NEXT: v_and_b32_e32 v4, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v5, 3, v4
-; GFX942-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v4
-; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v4
+; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX942-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v0
+; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx2 v[0:1], v5, s[8:9]
-; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[8:9]
+; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX942-NEXT: s_cbranch_execz .LBB9_2
; GFX942-NEXT: ; %bb.1: ; %bb.1
-; GFX942-NEXT: global_load_dwordx2 v[2:3], v5, s[10:11]
-; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v4
+; GFX942-NEXT: global_load_dwordx2 v[4:5], v1, s[10:11]
+; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0
; GFX942-NEXT: s_waitcnt vmcnt(1)
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
; GFX942-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
; GFX942-NEXT: s_and_b64 s[4:5], vcc, exec
-; GFX942-NEXT: v_mov_b32_e32 v1, v0
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
; GFX942-NEXT: .LBB9_2: ; %Flow
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
; GFX942-NEXT: s_cbranch_execz .LBB9_4
; GFX942-NEXT: ; %bb.3: ; %bb.2
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[12:13]
-; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[12:13]
; GFX942-NEXT: .LBB9_4: ; %bb.3
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[14:15]
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: global_store_dwordx2 v0, v[4:5], s[14:15]
; GFX942-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -617,30 +617,30 @@ define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspac
; GFX942-LABEL: v8i8_multi_block:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX942-NEXT: v_and_b32_e32 v5, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v6, 3, v5
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v5
+; GFX942-NEXT: v_and_b32_e32 v3, 0x3ff, v0
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v3
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v3
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx2 v[2:3], v6, s[8:9]
+; GFX942-NEXT: global_load_dwordx2 v[0:1], v4, s[8:9]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[2:3]
+; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX942-NEXT: s_cbranch_execz .LBB11_4
; GFX942-NEXT: ; %bb.1: ; %bb.1
-; GFX942-NEXT: global_load_dwordx2 v[0:1], v6, s[10:11]
-; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v5
+; GFX942-NEXT: global_load_dwordx2 v[6:7], v4, s[10:11]
+; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v3
; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX942-NEXT: s_cbranch_execz .LBB11_3
; GFX942-NEXT: ; %bb.2: ; %bb.2
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
-; GFX942-NEXT: global_store_dwordx2 v5, v[2:3], s[12:13]
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[12:13]
; GFX942-NEXT: .LBB11_3: ; %Flow
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX942-NEXT: .LBB11_4: ; %bb.3
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[14:15]
+; GFX942-NEXT: global_store_dwordx2 v2, v[6:7], s[14:15]
; GFX942-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -859,15 +859,15 @@ define amdgpu_kernel void @v8i8_mfma_i8(ptr addrspace(1) %src1, ptr addrspace(1)
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX942-NEXT: v_and_b32_e32 v4, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v4
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_lshlrev_b32_e32 v1, 3, v4
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v4
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[8:9]
+; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[8:9]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX942-NEXT: s_cbranch_execz .LBB14_2
; GFX942-NEXT: ; %bb.1: ; %bb.1
-; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[10:11]
+; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[10:11]
; GFX942-NEXT: .LBB14_2: ; %bb.2
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[14:15], 0x0
@@ -878,9 +878,9 @@ define amdgpu_kernel void @v8i8_mfma_i8(ptr addrspace(1) %src1, ptr addrspace(1)
; GFX942-NEXT: v_accvgpr_write_b32 a3, s3
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[0:1], v[0:1], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[2:3], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
; GFX942-NEXT: s_nop 6
-; GFX942-NEXT: global_store_dwordx4 v2, a[0:3], s[12:13]
+; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[12:13]
; GFX942-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -909,15 +909,15 @@ define amdgpu_kernel void @v8i8_mfma_half(ptr addrspace(1) %src1, ptr addrspace(
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx8 s[36:43], s[4:5], 0x24
; GFX942-NEXT: v_and_b32_e32 v4, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v4
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_lshlrev_b32_e32 v1, 3, v4
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v4
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[36:37]
+; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[36:37]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX942-NEXT: s_cbranch_execz .LBB15_2
; GFX942-NEXT: ; %bb.1: ; %bb.1
-; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[38:39]
+; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[38:39]
; GFX942-NEXT: .LBB15_2: ; %bb.2
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_load_dwordx16 s[16:31], s[42:43], 0x0
@@ -957,18 +957,18 @@ define amdgpu_kernel void @v8i8_mfma_half(ptr addrspace(1) %src1, ptr addrspace(
; GFX942-NEXT: v_accvgpr_write_b32 a31, s15
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mfma_f32_32x32x4_2b_f16 a[0:31], v[0:1], v[0:1], a[0:31] cbsz:1 abid:2 blgp:3
+; GFX942-NEXT: v_mfma_f32_32x32x4_2b_f16 a[0:31], v[2:3], v[2:3], a[0:31] cbsz:1 abid:2 blgp:3
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 2
-; GFX942-NEXT: global_store_dwordx4 v2, a[28:31], s[40:41] offset:112
-; GFX942-NEXT: global_store_dwordx4 v2, a[24:27], s[40:41] offset:96
-; GFX942-NEXT: global_store_dwordx4 v2, a[20:23], s[40:41] offset:80
-; GFX942-NEXT: global_store_dwordx4 v2, a[16:19], s[40:41] offset:64
-; GFX942-NEXT: global_store_dwordx4 v2, a[12:15], s[40:41] offset:48
-; GFX942-NEXT: global_store_dwordx4 v2, a[8:11], s[40:41] offset:32
-; GFX942-NEXT: global_store_dwordx4 v2, a[4:7], s[40:41] offset:16
-; GFX942-NEXT: global_store_dwordx4 v2, a[0:3], s[40:41]
+; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[40:41] offset:112
+; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[40:41] offset:96
+; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[40:41] offset:80
+; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[40:41] offset:64
+; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[40:41] offset:48
+; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[40:41] offset:32
+; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[40:41] offset:16
+; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[40:41]
; GFX942-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
>From 5b041ccc9d1d15aa14c197db6d0b14b4e89ae33a Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Fri, 18 Jul 2025 09:48:22 -0700
Subject: [PATCH 2/5] Update lit test
Change-Id: I70273f7fca71635185339c3d7ab038ce073664ca
---
.../bad-agpr-vgpr-regalloc-priority.mir | 24 +++++++++----------
1 file changed, 12 insertions(+), 12 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/bad-agpr-vgpr-regalloc-priority.mir b/llvm/test/CodeGen/AMDGPU/bad-agpr-vgpr-regalloc-priority.mir
index 1a457c94778fd..9241a23fca433 100644
--- a/llvm/test/CodeGen/AMDGPU/bad-agpr-vgpr-regalloc-priority.mir
+++ b/llvm/test/CodeGen/AMDGPU/bad-agpr-vgpr-regalloc-priority.mir
@@ -38,20 +38,20 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: early-clobber renamable $sgpr6_sgpr7 = S_LOAD_DWORDX2_IMM_ec renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4)
; CHECK-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM renamable $sgpr6_sgpr7, 0, 0 :: ("amdgpu-noclobber" load (s128), addrspace 1)
- ; CHECK-NEXT: renamable $vgpr4 = V_MOV_B32_e32 1065353216, implicit $exec
- ; CHECK-NEXT: renamable $vgpr5 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: renamable $vgpr6 = V_MOV_B32_e32 1073741824, implicit $exec
+ ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 1065353216, implicit $exec
+ ; CHECK-NEXT: renamable $vgpr4 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: renamable $vgpr1 = V_MOV_B32_e32 1073741824, implicit $exec
; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3
- ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr4, $vgpr6, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: renamable $vgpr1 = COPY renamable $agpr1
- ; CHECK-NEXT: renamable $vgpr0 = COPY renamable $agpr0
- ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr4, $vgpr6, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: renamable $vgpr3 = COPY renamable $agpr1
- ; CHECK-NEXT: renamable $vgpr2 = COPY killed renamable $agpr0
- ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3
- ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr4, killed $vgpr6, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr0, $vgpr1, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: renamable $vgpr6 = COPY renamable $agpr1
+ ; CHECK-NEXT: renamable $vgpr5 = COPY renamable $agpr0
+ ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr0, $vgpr1, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: renamable $vgpr8 = COPY renamable $agpr1
+ ; CHECK-NEXT: renamable $vgpr7 = COPY killed renamable $agpr0
+ ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr5_vgpr6_vgpr7_vgpr8
+ ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr0, killed $vgpr1, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed renamable $agpr0_agpr1_agpr2_agpr3
- ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr5, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $sgpr6_sgpr7, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr4, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $sgpr6_sgpr7, 0, 0, implicit $exec :: (store (s128), addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
early-clobber renamable $sgpr6_sgpr7 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4)
renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM renamable $sgpr6_sgpr7, 0, 0 :: ("amdgpu-noclobber" load (s128), addrspace 1)
>From 8751d00bf7e6c2a723107142a96a1eb69a173a28 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Thu, 24 Jul 2025 16:10:37 -0700
Subject: [PATCH 3/5] Add test demonstrating regsize allocation
Change-Id: Ie68127a74fc32768690309419b1d393829125482
---
.../AMDGPU/large-avgpr-assign-lasr.mir | 94 +++++++++++++++++++
1 file changed, 94 insertions(+)
create mode 100644 llvm/test/CodeGen/AMDGPU/large-avgpr-assign-lasr.mir
diff --git a/llvm/test/CodeGen/AMDGPU/large-avgpr-assign-lasr.mir b/llvm/test/CodeGen/AMDGPU/large-avgpr-assign-lasr.mir
new file mode 100644
index 0000000000000..58e9b0a1aedd4
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/large-avgpr-assign-lasr.mir
@@ -0,0 +1,94 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -verify-regalloc -greedy-regclass-priority-trumps-globalness=1 -start-after=machine-scheduler -stop-after=virtregrewriter,2 -o - %s | FileCheck %s
+
+--- |
+ define void @temp_vgpr_to_agpr_should_not_undo_split_with_remat() #0 {
+ entry:
+ unreachable
+ }
+
+ attributes #0 = { "amdgpu-agpr-alloc"="0,0" }
+...
+
+
+---
+name: temp_vgpr_to_agpr_should_not_undo_split_with_remat
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+ scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99'
+ stackPtrOffsetReg: '$sgpr32'
+ argumentInfo:
+ privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+ kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+ workGroupIDX: { reg: '$sgpr6' }
+ privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+ workItemIDX: { reg: '$vgpr0' }
+ sgprForEXECCopy: '$sgpr100_sgpr101'
+body: |
+ bb.0:
+ liveins: $vgpr0, $sgpr4_sgpr5
+ ; CHECK-LABEL: name: temp_vgpr_to_agpr_should_not_undo_split_with_remat
+ ; CHECK: liveins: $vgpr0, $sgpr4_sgpr5
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF
+ ; CHECK-NEXT: dead renamable $vgpr1 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr1 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr2 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr3 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr4 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr5 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr6 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr7 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr8 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr9 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr10 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr11 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr12 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr13 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr14 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr15 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr16 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr17 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr18 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr19 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr20 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr21 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr22 = IMPLICIT_DEF
+ ; CHECK-NEXT: KILL killed renamable $vgpr2, killed renamable $vgpr3, killed renamable $vgpr4, killed renamable $vgpr5, killed renamable $vgpr6, killed renamable $vgpr7, killed renamable $vgpr8, killed renamable $vgpr9, killed renamable $vgpr10, killed renamable $vgpr11, killed renamable $vgpr12, killed renamable $vgpr13, killed renamable $vgpr14, killed renamable $vgpr15, killed renamable $vgpr16
+ ; CHECK-NEXT: S_NOP 0, implicit-def renamable $vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38
+ ; CHECK-NEXT: S_NOP 0, implicit-def renamable $vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54
+ ; CHECK-NEXT: KILL killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr17, killed renamable $vgpr18, killed renamable $vgpr19, killed renamable $vgpr20, killed renamable $vgpr21, killed renamable $vgpr22
+ ; CHECK-NEXT: S_NOP 0, implicit killed renamable $vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38, implicit killed renamable $vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54
+ ; CHECK-NEXT: S_ENDPGM 0
+ %1:vgpr_32 = IMPLICIT_DEF
+ %2:vgpr_32 = IMPLICIT_DEF
+ %2:vgpr_32 = IMPLICIT_DEF
+ %3:vgpr_32 = IMPLICIT_DEF
+ %4:vgpr_32 = IMPLICIT_DEF
+ %5:vgpr_32 = IMPLICIT_DEF
+ %6:vgpr_32 = IMPLICIT_DEF
+ %7:vgpr_32 = IMPLICIT_DEF
+ %8:vgpr_32 = IMPLICIT_DEF
+ %9:vgpr_32 = IMPLICIT_DEF
+ %10:vgpr_32 = IMPLICIT_DEF
+ %11:vgpr_32 = IMPLICIT_DEF
+ %12:vgpr_32 = IMPLICIT_DEF
+ %13:vgpr_32 = IMPLICIT_DEF
+ %14:vgpr_32 = IMPLICIT_DEF
+ %15:vgpr_32 = IMPLICIT_DEF
+ %16:vgpr_32 = IMPLICIT_DEF
+ %17:vgpr_32 = IMPLICIT_DEF
+ %18:vgpr_32 = IMPLICIT_DEF
+ %19:vgpr_32 = IMPLICIT_DEF
+ %20:vgpr_32 = IMPLICIT_DEF
+ %21:vgpr_32 = IMPLICIT_DEF
+ %22:vgpr_32 = IMPLICIT_DEF
+ %23:vgpr_32 = IMPLICIT_DEF
+ KILL %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, %16, %17
+ S_NOP 0, implicit-def %50:av_512
+ S_NOP 0, implicit-def %51:av_512
+ KILL %1, %2, %18, %19, %20, %21, %22, %23
+ S_NOP 0, implicit %50, implicit %51
+ S_ENDPGM 0
+...
>From 7c8829ea875c8f5196c8c5b250f00d87aa026f3f Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Fri, 25 Jul 2025 21:52:43 +0900
Subject: [PATCH 4/5] Fix test file name
---
.../{large-avgpr-assign-lasr.mir => large-avgpr-assign-last.mir} | 0
1 file changed, 0 insertions(+), 0 deletions(-)
rename llvm/test/CodeGen/AMDGPU/{large-avgpr-assign-lasr.mir => large-avgpr-assign-last.mir} (100%)
diff --git a/llvm/test/CodeGen/AMDGPU/large-avgpr-assign-lasr.mir b/llvm/test/CodeGen/AMDGPU/large-avgpr-assign-last.mir
similarity index 100%
rename from llvm/test/CodeGen/AMDGPU/large-avgpr-assign-lasr.mir
rename to llvm/test/CodeGen/AMDGPU/large-avgpr-assign-last.mir
>From 39ea0ec85329bfad5b75feb796c7634e34a95966 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Fri, 25 Jul 2025 12:06:42 -0700
Subject: [PATCH 5/5] Update lit tests
Change-Id: If4c7179cd642c38a7c78d545347b8c0e011a0cbf
---
llvm/test/CodeGen/AMDGPU/bf16.ll | 119 +-
.../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll | 64 +-
.../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll | 1234 ++++++++++-------
.../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll | 756 +++++-----
.../AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll | 57 +-
llvm/test/CodeGen/AMDGPU/spill-agpr.ll | 38 +-
6 files changed, 1229 insertions(+), 1039 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 7859fcdfe8a70..52e697cae9fe5 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -468,15 +468,28 @@ define <16 x bfloat> @v_load_global_v16bf16(ptr addrspace(1) %ptr) {
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_load_global_v16bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v9, v1
-; GFX9-NEXT: v_mov_b32_e32 v8, v0
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[8:9], off
-; GFX9-NEXT: global_load_dwordx4 v[4:7], v[8:9], off offset:16
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_load_global_v16bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v9, v1
+; GFX900-NEXT: v_mov_b32_e32 v8, v0
+; GFX900-NEXT: global_load_dwordx4 v[0:3], v[8:9], off
+; GFX900-NEXT: global_load_dwordx4 v[4:7], v[8:9], off offset:16
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_load_global_v16bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: global_load_dwordx4 v[8:11], v[0:1], off
+; GFX950-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16
+; GFX950-NEXT: s_waitcnt vmcnt(1)
+; GFX950-NEXT: v_mov_b32_e32 v0, v8
+; GFX950-NEXT: v_mov_b32_e32 v1, v9
+; GFX950-NEXT: v_mov_b32_e32 v2, v10
+; GFX950-NEXT: v_mov_b32_e32 v3, v11
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_load_global_v16bf16:
; GFX10: ; %bb.0:
@@ -619,17 +632,32 @@ define <32 x bfloat> @v_load_global_v32bf16(ptr addrspace(1) %ptr) {
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_load_global_v32bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v17, v1
-; GFX9-NEXT: v_mov_b32_e32 v16, v0
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[16:17], off
-; GFX9-NEXT: global_load_dwordx4 v[4:7], v[16:17], off offset:16
-; GFX9-NEXT: global_load_dwordx4 v[8:11], v[16:17], off offset:32
-; GFX9-NEXT: global_load_dwordx4 v[12:15], v[16:17], off offset:48
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_load_global_v32bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v17, v1
+; GFX900-NEXT: v_mov_b32_e32 v16, v0
+; GFX900-NEXT: global_load_dwordx4 v[0:3], v[16:17], off
+; GFX900-NEXT: global_load_dwordx4 v[4:7], v[16:17], off offset:16
+; GFX900-NEXT: global_load_dwordx4 v[8:11], v[16:17], off offset:32
+; GFX900-NEXT: global_load_dwordx4 v[12:15], v[16:17], off offset:48
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_load_global_v32bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: global_load_dwordx4 v[16:19], v[0:1], off
+; GFX950-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16
+; GFX950-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:32
+; GFX950-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:48
+; GFX950-NEXT: s_waitcnt vmcnt(3)
+; GFX950-NEXT: v_mov_b32_e32 v0, v16
+; GFX950-NEXT: v_mov_b32_e32 v1, v17
+; GFX950-NEXT: v_mov_b32_e32 v2, v18
+; GFX950-NEXT: v_mov_b32_e32 v3, v19
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_load_global_v32bf16:
; GFX10: ; %bb.0:
@@ -877,22 +905,41 @@ define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) {
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_load_global_v64bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v29, v1
-; GFX9-NEXT: v_mov_b32_e32 v28, v0
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[28:29], off
-; GFX9-NEXT: global_load_dwordx4 v[4:7], v[28:29], off offset:16
-; GFX9-NEXT: global_load_dwordx4 v[8:11], v[28:29], off offset:32
-; GFX9-NEXT: global_load_dwordx4 v[12:15], v[28:29], off offset:48
-; GFX9-NEXT: global_load_dwordx4 v[16:19], v[28:29], off offset:64
-; GFX9-NEXT: global_load_dwordx4 v[20:23], v[28:29], off offset:80
-; GFX9-NEXT: global_load_dwordx4 v[24:27], v[28:29], off offset:96
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: global_load_dwordx4 v[28:31], v[28:29], off offset:112
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_load_global_v64bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v29, v1
+; GFX900-NEXT: v_mov_b32_e32 v28, v0
+; GFX900-NEXT: global_load_dwordx4 v[0:3], v[28:29], off
+; GFX900-NEXT: global_load_dwordx4 v[4:7], v[28:29], off offset:16
+; GFX900-NEXT: global_load_dwordx4 v[8:11], v[28:29], off offset:32
+; GFX900-NEXT: global_load_dwordx4 v[12:15], v[28:29], off offset:48
+; GFX900-NEXT: global_load_dwordx4 v[16:19], v[28:29], off offset:64
+; GFX900-NEXT: global_load_dwordx4 v[20:23], v[28:29], off offset:80
+; GFX900-NEXT: global_load_dwordx4 v[24:27], v[28:29], off offset:96
+; GFX900-NEXT: s_nop 0
+; GFX900-NEXT: global_load_dwordx4 v[28:31], v[28:29], off offset:112
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_load_global_v64bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: global_load_dwordx4 v[32:35], v[0:1], off
+; GFX950-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16
+; GFX950-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:32
+; GFX950-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:48
+; GFX950-NEXT: global_load_dwordx4 v[16:19], v[0:1], off offset:64
+; GFX950-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:80
+; GFX950-NEXT: global_load_dwordx4 v[24:27], v[0:1], off offset:96
+; GFX950-NEXT: global_load_dwordx4 v[28:31], v[0:1], off offset:112
+; GFX950-NEXT: s_waitcnt vmcnt(7)
+; GFX950-NEXT: v_mov_b32_e32 v0, v32
+; GFX950-NEXT: v_mov_b32_e32 v1, v33
+; GFX950-NEXT: v_mov_b32_e32 v2, v34
+; GFX950-NEXT: v_mov_b32_e32 v3, v35
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_load_global_v64bf16:
; GFX10: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
index 780c7e93f80d0..572b72737fece 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
@@ -443,10 +443,10 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl
; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: v_mov_b32_e32 v2, s10
; GFX90A-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
-; GFX90A-NEXT: v_mov_b32_e32 v1, s11
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v3, s11
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[12:13], s[12:13] op_sel:[0,1]
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0
; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1
@@ -457,7 +457,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl
; GFX90A-NEXT: v_accvgpr_write_b32 a6, s6
; GFX90A-NEXT: v_accvgpr_write_b32 a7, s7
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[2:3], v[0:1], a[0:7] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 7
@@ -471,10 +471,10 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl
; GFX942-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: v_mov_b32_e32 v2, s10
; GFX942-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
-; GFX942-NEXT: v_mov_b32_e32 v1, s11
-; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[12:13]
+; GFX942-NEXT: v_mov_b32_e32 v3, s11
+; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: v_accvgpr_write_b32 a0, s0
; GFX942-NEXT: v_accvgpr_write_b32 a1, s1
@@ -485,7 +485,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl
; GFX942-NEXT: v_accvgpr_write_b32 a6, s6
; GFX942-NEXT: v_accvgpr_write_b32 a7, s7
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[2:3], v[0:1], a[0:7] cbsz:1 abid:2 neg:[1,1,0]
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 7
@@ -898,20 +898,20 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0x3ff00000
-; GFX90A-NEXT: v_accvgpr_write_b32 a7, v2
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0x3ff00000
+; GFX90A-NEXT: v_accvgpr_write_b32 a7, v0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, s2
-; GFX90A-NEXT: v_mov_b32_e32 v1, s3
+; GFX90A-NEXT: v_mov_b32_e32 v2, s2
+; GFX90A-NEXT: v_mov_b32_e32 v3, s3
; GFX90A-NEXT: v_accvgpr_mov_b32 a1, a0
; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0
; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a0
; GFX90A-NEXT: v_accvgpr_mov_b32 a4, a0
; GFX90A-NEXT: v_accvgpr_mov_b32 a5, a0
; GFX90A-NEXT: v_accvgpr_mov_b32 a6, a0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
+; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[2:3], v[0:1], a[0:7]
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 7
@@ -925,20 +925,20 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX942-NEXT: v_accvgpr_write_b32 a0, 0
-; GFX942-NEXT: v_mov_b32_e32 v2, 0x3ff00000
-; GFX942-NEXT: v_accvgpr_write_b32 a7, v2
+; GFX942-NEXT: v_mov_b32_e32 v0, 0x3ff00000
+; GFX942-NEXT: v_accvgpr_write_b32 a7, v0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v0, s2
-; GFX942-NEXT: v_mov_b32_e32 v1, s3
+; GFX942-NEXT: v_mov_b32_e32 v2, s2
+; GFX942-NEXT: v_mov_b32_e32 v3, s3
; GFX942-NEXT: v_accvgpr_mov_b32 a1, a0
; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0
; GFX942-NEXT: v_accvgpr_mov_b32 a3, a0
; GFX942-NEXT: v_accvgpr_mov_b32 a4, a0
; GFX942-NEXT: v_accvgpr_mov_b32 a5, a0
; GFX942-NEXT: v_accvgpr_mov_b32 a6, a0
-; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7]
+; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[2:3], v[0:1], a[0:7]
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 7
@@ -957,21 +957,21 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %
; GFX90A: ; %bb.0: ; %bb
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0x405ec000
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0x405ec000
; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, s2
-; GFX90A-NEXT: v_mov_b32_e32 v1, s3
+; GFX90A-NEXT: v_mov_b32_e32 v2, s2
+; GFX90A-NEXT: v_mov_b32_e32 v3, s3
; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0
; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a1
; GFX90A-NEXT: v_accvgpr_mov_b32 a4, a0
; GFX90A-NEXT: v_accvgpr_mov_b32 a5, a1
; GFX90A-NEXT: v_accvgpr_mov_b32 a6, a0
; GFX90A-NEXT: v_accvgpr_mov_b32 a7, a1
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
+; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[2:3], v[0:1], a[0:7]
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 7
@@ -984,21 +984,21 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT: v_mov_b32_e32 v2, 0x405ec000
+; GFX942-NEXT: v_mov_b32_e32 v0, 0x405ec000
; GFX942-NEXT: v_accvgpr_write_b32 a0, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a1, v2
+; GFX942-NEXT: v_accvgpr_write_b32 a1, v0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v0, s2
-; GFX942-NEXT: v_mov_b32_e32 v1, s3
+; GFX942-NEXT: v_mov_b32_e32 v2, s2
+; GFX942-NEXT: v_mov_b32_e32 v3, s3
; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0
; GFX942-NEXT: v_accvgpr_mov_b32 a3, a1
; GFX942-NEXT: v_accvgpr_mov_b32 a4, a0
; GFX942-NEXT: v_accvgpr_mov_b32 a5, a1
; GFX942-NEXT: v_accvgpr_mov_b32 a6, a0
; GFX942-NEXT: v_accvgpr_mov_b32 a7, a1
-; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7]
+; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[2:3], v[0:1], a[0:7]
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 7
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll
index 92af34f25a1a6..7d85d3439eed9 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll
@@ -35,26 +35,26 @@ declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.bf8(<2 x i32>, <4 x i3
declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.fp8(<2 x i32>, <4 x i32>, <16 x float>, i32, i32, i32)
define amdgpu_kernel void @test_mfma_i32_16x16x32i8(ptr addrspace(1) %arg) #0 {
-; GFX942-SDAG-LABEL: test_mfma_i32_16x16x32i8:
-; GFX942-SDAG: ; %bb.0: ; %bb
-; GFX942-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 2
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 4
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 3
-; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX942-SDAG-NEXT: s_nop 1
-; GFX942-SDAG-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX942-SDAG-NEXT: s_nop 6
-; GFX942-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
-; GFX942-SDAG-NEXT: s_endpgm
+; GFX942-VGPRCD-SDAG-LABEL: test_mfma_i32_16x16x32i8:
+; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-VGPRCD-SDAG-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: test_mfma_i32_16x16x32i8:
; GFX942-GISEL: ; %bb.0: ; %bb
@@ -77,26 +77,47 @@ define amdgpu_kernel void @test_mfma_i32_16x16x32i8(ptr addrspace(1) %arg) #0 {
; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
; GFX942-GISEL-NEXT: s_endpgm
;
-; GFX950-SDAG-LABEL: test_mfma_i32_16x16x32i8:
-; GFX950-SDAG: ; %bb.0: ; %bb
-; GFX950-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 2
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 1
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 4
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, 3
-; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, 0
-; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX950-SDAG-NEXT: s_nop 1
-; GFX950-SDAG-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX950-SDAG-NEXT: s_nop 7
-; GFX950-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
-; GFX950-SDAG-NEXT: s_endpgm
+; GFX942-AGPRCD-SDAG-LABEL: test_mfma_i32_16x16x32i8:
+; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
+; GFX942-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-SDAG-LABEL: test_mfma_i32_16x16x32i8:
+; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-VGPRCD-SDAG-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: test_mfma_i32_16x16x32i8:
; GFX950-GISEL: ; %bb.0: ; %bb
@@ -118,6 +139,27 @@ define amdgpu_kernel void @test_mfma_i32_16x16x32i8(ptr addrspace(1) %arg) #0 {
; GFX950-GISEL-NEXT: s_nop 6
; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
; GFX950-GISEL-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-SDAG-LABEL: test_mfma_i32_16x16x32i8:
+; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-AGPRCD-SDAG-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
+; GFX950-AGPRCD-SDAG-NEXT: s_endpgm
bb:
%in.1 = load <4 x i32>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x32.i8(i64 4294967298, i64 12884901892, <4 x i32> %in.1, i32 1, i32 2, i32 3)
@@ -281,26 +323,26 @@ bb:
}
define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_bf8(ptr addrspace(1) %arg) #0 {
-; GFX942-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8:
-; GFX942-SDAG: ; %bb.0: ; %bb
-; GFX942-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 2
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 4
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 3
-; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX942-SDAG-NEXT: s_nop 1
-; GFX942-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX942-SDAG-NEXT: s_nop 6
-; GFX942-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
-; GFX942-SDAG-NEXT: s_endpgm
+; GFX942-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8:
+; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-VGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: test_mfma_f32_16x16x32_bf8_bf8:
; GFX942-GISEL: ; %bb.0: ; %bb
@@ -323,26 +365,47 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_bf8(ptr addrspace(1) %arg)
; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
; GFX942-GISEL-NEXT: s_endpgm
;
-; GFX950-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8:
-; GFX950-SDAG: ; %bb.0: ; %bb
-; GFX950-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 2
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 1
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 4
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, 3
-; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, 0
-; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX950-SDAG-NEXT: s_nop 1
-; GFX950-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX950-SDAG-NEXT: s_nop 7
-; GFX950-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
-; GFX950-SDAG-NEXT: s_endpgm
+; GFX942-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8:
+; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
+; GFX942-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8:
+; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-VGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: test_mfma_f32_16x16x32_bf8_bf8:
; GFX950-GISEL: ; %bb.0: ; %bb
@@ -364,6 +427,27 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_bf8(ptr addrspace(1) %arg)
; GFX950-GISEL-NEXT: s_nop 6
; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
; GFX950-GISEL-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8:
+; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-AGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
+; GFX950-AGPRCD-SDAG-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf8.bf8(i64 4294967298, i64 12884901892, <4 x float> %in.1, i32 1, i32 2, i32 3)
@@ -372,26 +456,26 @@ bb:
}
define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_fp8(ptr addrspace(1) %arg) #0 {
-; GFX942-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8:
-; GFX942-SDAG: ; %bb.0: ; %bb
-; GFX942-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 2
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 4
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 3
-; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX942-SDAG-NEXT: s_nop 1
-; GFX942-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX942-SDAG-NEXT: s_nop 6
-; GFX942-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
-; GFX942-SDAG-NEXT: s_endpgm
+; GFX942-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8:
+; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-VGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: test_mfma_f32_16x16x32_bf8_fp8:
; GFX942-GISEL: ; %bb.0: ; %bb
@@ -414,26 +498,47 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_fp8(ptr addrspace(1) %arg)
; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
; GFX942-GISEL-NEXT: s_endpgm
;
-; GFX950-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8:
-; GFX950-SDAG: ; %bb.0: ; %bb
-; GFX950-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 2
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 1
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 4
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, 3
-; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, 0
-; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX950-SDAG-NEXT: s_nop 1
-; GFX950-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX950-SDAG-NEXT: s_nop 7
-; GFX950-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
-; GFX950-SDAG-NEXT: s_endpgm
+; GFX942-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8:
+; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
+; GFX942-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8:
+; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-VGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: test_mfma_f32_16x16x32_bf8_fp8:
; GFX950-GISEL: ; %bb.0: ; %bb
@@ -455,6 +560,27 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_fp8(ptr addrspace(1) %arg)
; GFX950-GISEL-NEXT: s_nop 6
; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
; GFX950-GISEL-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8:
+; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-AGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
+; GFX950-AGPRCD-SDAG-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf8.fp8(i64 4294967298, i64 12884901892, <4 x float> %in.1, i32 1, i32 2, i32 3)
@@ -463,26 +589,26 @@ bb:
}
define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_bf8(ptr addrspace(1) %arg) #0 {
-; GFX942-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8:
-; GFX942-SDAG: ; %bb.0: ; %bb
-; GFX942-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 2
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 4
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 3
-; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX942-SDAG-NEXT: s_nop 1
-; GFX942-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX942-SDAG-NEXT: s_nop 6
-; GFX942-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
-; GFX942-SDAG-NEXT: s_endpgm
+; GFX942-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8:
+; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-VGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: test_mfma_f32_16x16x32_fp8_bf8:
; GFX942-GISEL: ; %bb.0: ; %bb
@@ -505,26 +631,47 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_bf8(ptr addrspace(1) %arg)
; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
; GFX942-GISEL-NEXT: s_endpgm
;
-; GFX950-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8:
-; GFX950-SDAG: ; %bb.0: ; %bb
-; GFX950-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 2
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 1
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 4
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, 3
-; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, 0
-; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX950-SDAG-NEXT: s_nop 1
-; GFX950-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX950-SDAG-NEXT: s_nop 7
-; GFX950-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
-; GFX950-SDAG-NEXT: s_endpgm
+; GFX942-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8:
+; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
+; GFX942-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8:
+; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-VGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: test_mfma_f32_16x16x32_fp8_bf8:
; GFX950-GISEL: ; %bb.0: ; %bb
@@ -546,6 +693,27 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_bf8(ptr addrspace(1) %arg)
; GFX950-GISEL-NEXT: s_nop 6
; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
; GFX950-GISEL-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8:
+; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-AGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
+; GFX950-AGPRCD-SDAG-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.fp8.bf8(i64 4294967298, i64 12884901892, <4 x float> %in.1, i32 1, i32 2, i32 3)
@@ -554,26 +722,26 @@ bb:
}
define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_fp8(ptr addrspace(1) %arg) #0 {
-; GFX942-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8:
-; GFX942-SDAG: ; %bb.0: ; %bb
-; GFX942-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 2
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 4
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 3
-; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX942-SDAG-NEXT: s_nop 1
-; GFX942-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX942-SDAG-NEXT: s_nop 6
-; GFX942-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
-; GFX942-SDAG-NEXT: s_endpgm
+; GFX942-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8:
+; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-VGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: test_mfma_f32_16x16x32_fp8_fp8:
; GFX942-GISEL: ; %bb.0: ; %bb
@@ -596,26 +764,47 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_fp8(ptr addrspace(1) %arg)
; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
; GFX942-GISEL-NEXT: s_endpgm
;
-; GFX950-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8:
-; GFX950-SDAG: ; %bb.0: ; %bb
-; GFX950-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 2
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 1
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 4
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, 3
-; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, 0
-; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX950-SDAG-NEXT: s_nop 1
-; GFX950-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX950-SDAG-NEXT: s_nop 7
-; GFX950-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
-; GFX950-SDAG-NEXT: s_endpgm
+; GFX942-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8:
+; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
+; GFX942-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8:
+; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-VGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: test_mfma_f32_16x16x32_fp8_fp8:
; GFX950-GISEL: ; %bb.0: ; %bb
@@ -637,6 +826,27 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_fp8(ptr addrspace(1) %arg)
; GFX950-GISEL-NEXT: s_nop 6
; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
; GFX950-GISEL-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8:
+; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-AGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
+; GFX950-AGPRCD-SDAG-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.fp8.fp8(i64 4294967298, i64 12884901892, <4 x float> %in.1, i32 1, i32 2, i32 3)
@@ -1269,20 +1479,20 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_f16(ptr addrspace(1) %arg, <
; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX942-VGPRCD-SDAG-NEXT: s_load_dword s6, s[4:5], 0x44
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, 0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, 0
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s6
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v7, s6
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x32_f16 v[4:7], v[8:9], v[0:3], v11 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2
; GFX942-VGPRCD-SDAG-NEXT: s_nop 6
-; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v10, v[4:7], s[8:9]
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v6, v[8:11], s[8:9]
; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_f32_16x16x32_f16:
@@ -1291,18 +1501,18 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_f16(ptr addrspace(1) %arg, <
; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s6, s[4:5], 0x44
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v10, s6
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s6
; GFX942-VGPRCD-GISEL-NEXT: s_nop 1
-; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x32_f16 v[4:7], v[8:9], v[0:3], v10 cbsz:1 abid:2
+; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX942-VGPRCD-GISEL-NEXT: s_nop 5
-; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[4:7], s[8:9]
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[8:9]
; GFX942-VGPRCD-GISEL-NEXT: s_endpgm
;
; GFX942-AGPRCD-LABEL: test_smfmac_f32_16x16x32_f16:
@@ -1332,20 +1542,20 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_f16(ptr addrspace(1) %arg, <
; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX950-VGPRCD-SDAG-NEXT: s_load_dword s6, s[4:5], 0x44
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, 0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, 0
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s6
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v7, s6
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x32_f16 v[4:7], v[8:9], v[0:3], v11 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2
; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
-; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v10, v[4:7], s[8:9]
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v6, v[8:11], s[8:9]
; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_f32_16x16x32_f16:
@@ -1354,18 +1564,18 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_f16(ptr addrspace(1) %arg, <
; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s6, s[4:5], 0x44
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v10, s6
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s6
; GFX950-VGPRCD-GISEL-NEXT: s_nop 1
-; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x32_f16 v[4:7], v[8:9], v[0:3], v10 cbsz:1 abid:2
+; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX950-VGPRCD-GISEL-NEXT: s_nop 6
-; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[4:7], s[8:9]
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[8:9]
; GFX950-VGPRCD-GISEL-NEXT: s_endpgm
;
; GFX950-AGPRCD-LABEL: test_smfmac_f32_16x16x32_f16:
@@ -1681,20 +1891,20 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_bf16(ptr addrspace(1) %arg,
; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX942-VGPRCD-SDAG-NEXT: s_load_dword s6, s[4:5], 0x44
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, 0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, 0
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s6
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v7, s6
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x32_bf16 v[4:7], v[8:9], v[0:3], v11 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2
; GFX942-VGPRCD-SDAG-NEXT: s_nop 6
-; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v10, v[4:7], s[8:9]
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v6, v[8:11], s[8:9]
; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_f32_16x16x32_bf16:
@@ -1703,18 +1913,18 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_bf16(ptr addrspace(1) %arg,
; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s6, s[4:5], 0x44
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v10, s6
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s6
; GFX942-VGPRCD-GISEL-NEXT: s_nop 1
-; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x32_bf16 v[4:7], v[8:9], v[0:3], v10 cbsz:1 abid:2
+; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX942-VGPRCD-GISEL-NEXT: s_nop 5
-; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[4:7], s[8:9]
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[8:9]
; GFX942-VGPRCD-GISEL-NEXT: s_endpgm
;
; GFX942-AGPRCD-LABEL: test_smfmac_f32_16x16x32_bf16:
@@ -1744,20 +1954,20 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_bf16(ptr addrspace(1) %arg,
; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX950-VGPRCD-SDAG-NEXT: s_load_dword s6, s[4:5], 0x44
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, 0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, 0
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s6
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v7, s6
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x32_bf16 v[4:7], v[8:9], v[0:3], v11 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2
; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
-; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v10, v[4:7], s[8:9]
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v6, v[8:11], s[8:9]
; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_f32_16x16x32_bf16:
@@ -1766,18 +1976,18 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_bf16(ptr addrspace(1) %arg,
; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s6, s[4:5], 0x44
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v10, s6
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s6
; GFX950-VGPRCD-GISEL-NEXT: s_nop 1
-; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x32_bf16 v[4:7], v[8:9], v[0:3], v10 cbsz:1 abid:2
+; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX950-VGPRCD-GISEL-NEXT: s_nop 6
-; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[4:7], s[8:9]
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[8:9]
; GFX950-VGPRCD-GISEL-NEXT: s_endpgm
;
; GFX950-AGPRCD-LABEL: test_smfmac_f32_16x16x32_bf16:
@@ -2093,23 +2303,23 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_i8(ptr addrspace(1) %arg, <2
; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, 0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v8, s8
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v9, s9
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s14
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_i32_16x16x64_i8 v[4:7], v[8:9], v[0:3], v11 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_i32_16x16x64_i8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
; GFX942-VGPRCD-SDAG-NEXT: s_nop 6
-; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v10, v[4:7], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7]
; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_i8:
@@ -2119,21 +2329,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_i8(ptr addrspace(1) %arg, <2
; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2
; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9]
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11]
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v10, s14
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14
; GFX942-VGPRCD-GISEL-NEXT: s_nop 1
-; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_i32_16x16x64_i8 v[4:7], v[8:9], v[0:3], v10 cbsz:1 abid:2
+; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_i32_16x16x64_i8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX942-VGPRCD-GISEL-NEXT: s_nop 5
-; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[4:7], s[12:13]
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
; GFX942-VGPRCD-GISEL-NEXT: s_endpgm
;
; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_i8:
@@ -2197,23 +2407,23 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_i8(ptr addrspace(1) %arg, <2
; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, 0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v8, s8
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v9, s9
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s14
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_i32_16x16x64_i8 v[4:7], v[8:9], v[0:3], v11 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_i32_16x16x64_i8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
-; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v10, v[4:7], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7]
; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_i8:
@@ -2223,21 +2433,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_i8(ptr addrspace(1) %arg, <2
; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2
; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9]
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11]
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v10, s14
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14
; GFX950-VGPRCD-GISEL-NEXT: s_nop 1
-; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_i32_16x16x64_i8 v[4:7], v[8:9], v[0:3], v10 cbsz:1 abid:2
+; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_i32_16x16x64_i8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX950-VGPRCD-GISEL-NEXT: s_nop 6
-; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[4:7], s[12:13]
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
; GFX950-VGPRCD-GISEL-NEXT: s_endpgm
;
; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_i8:
@@ -2309,16 +2519,16 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s16
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s17
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s18
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v17, s19
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s20
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s21
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s22
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -2327,7 +2537,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_i32_32x32x32_i8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_i32_32x32x32_i8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX942-VGPRCD-SDAG-NEXT: s_nop 7
; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
@@ -2461,16 +2671,16 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s16
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s17
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s18
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v17, s19
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s20
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s21
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s22
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -2479,7 +2689,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_i32_32x32x32_i8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_i32_32x32x32_i8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
; GFX950-VGPRCD-SDAG-NEXT: s_nop 2
@@ -2619,23 +2829,23 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_bf8(ptr addrspace(1) %ar
; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, 0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v8, s8
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v9, s9
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s14
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[4:7], v[8:9], v[0:3], v11 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
; GFX942-VGPRCD-SDAG-NEXT: s_nop 6
-; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v10, v[4:7], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7]
; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_bf8:
@@ -2645,21 +2855,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_bf8(ptr addrspace(1) %ar
; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2
; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9]
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11]
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v10, s14
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14
; GFX942-VGPRCD-GISEL-NEXT: s_nop 1
-; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[4:7], v[8:9], v[0:3], v10 cbsz:1 abid:2
+; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX942-VGPRCD-GISEL-NEXT: s_nop 5
-; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[4:7], s[12:13]
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
; GFX942-VGPRCD-GISEL-NEXT: s_endpgm
;
; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_bf8:
@@ -2723,23 +2933,23 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_bf8(ptr addrspace(1) %ar
; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, 0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v8, s8
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v9, s9
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s14
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[4:7], v[8:9], v[0:3], v11 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
-; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v10, v[4:7], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7]
; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_bf8:
@@ -2749,21 +2959,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_bf8(ptr addrspace(1) %ar
; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2
; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9]
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11]
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v10, s14
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14
; GFX950-VGPRCD-GISEL-NEXT: s_nop 1
-; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[4:7], v[8:9], v[0:3], v10 cbsz:1 abid:2
+; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX950-VGPRCD-GISEL-NEXT: s_nop 6
-; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[4:7], s[12:13]
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
; GFX950-VGPRCD-GISEL-NEXT: s_endpgm
;
; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_bf8:
@@ -2834,23 +3044,23 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_fp8(ptr addrspace(1) %ar
; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, 0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v8, s8
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v9, s9
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s14
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[4:7], v[8:9], v[0:3], v11 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
; GFX942-VGPRCD-SDAG-NEXT: s_nop 6
-; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v10, v[4:7], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7]
; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_fp8:
@@ -2860,21 +3070,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_fp8(ptr addrspace(1) %ar
; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2
; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9]
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11]
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v10, s14
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14
; GFX942-VGPRCD-GISEL-NEXT: s_nop 1
-; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[4:7], v[8:9], v[0:3], v10 cbsz:1 abid:2
+; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX942-VGPRCD-GISEL-NEXT: s_nop 5
-; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[4:7], s[12:13]
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
; GFX942-VGPRCD-GISEL-NEXT: s_endpgm
;
; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_fp8:
@@ -2938,23 +3148,23 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_fp8(ptr addrspace(1) %ar
; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, 0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v8, s8
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v9, s9
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s14
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[4:7], v[8:9], v[0:3], v11 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
-; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v10, v[4:7], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7]
; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_fp8:
@@ -2964,21 +3174,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_fp8(ptr addrspace(1) %ar
; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2
; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9]
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11]
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v10, s14
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14
; GFX950-VGPRCD-GISEL-NEXT: s_nop 1
-; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[4:7], v[8:9], v[0:3], v10 cbsz:1 abid:2
+; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX950-VGPRCD-GISEL-NEXT: s_nop 6
-; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[4:7], s[12:13]
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
; GFX950-VGPRCD-GISEL-NEXT: s_endpgm
;
; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_fp8:
@@ -3049,23 +3259,23 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_bf8(ptr addrspace(1) %ar
; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, 0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v8, s8
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v9, s9
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s14
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[4:7], v[8:9], v[0:3], v11 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
; GFX942-VGPRCD-SDAG-NEXT: s_nop 6
-; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v10, v[4:7], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7]
; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_bf8:
@@ -3075,21 +3285,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_bf8(ptr addrspace(1) %ar
; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2
; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9]
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11]
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v10, s14
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14
; GFX942-VGPRCD-GISEL-NEXT: s_nop 1
-; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[4:7], v[8:9], v[0:3], v10 cbsz:1 abid:2
+; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX942-VGPRCD-GISEL-NEXT: s_nop 5
-; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[4:7], s[12:13]
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
; GFX942-VGPRCD-GISEL-NEXT: s_endpgm
;
; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_bf8:
@@ -3153,23 +3363,23 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_bf8(ptr addrspace(1) %ar
; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, 0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v8, s8
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v9, s9
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s14
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[4:7], v[8:9], v[0:3], v11 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
-; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v10, v[4:7], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7]
; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_bf8:
@@ -3179,21 +3389,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_bf8(ptr addrspace(1) %ar
; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2
; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9]
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11]
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v10, s14
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14
; GFX950-VGPRCD-GISEL-NEXT: s_nop 1
-; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[4:7], v[8:9], v[0:3], v10 cbsz:1 abid:2
+; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX950-VGPRCD-GISEL-NEXT: s_nop 6
-; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[4:7], s[12:13]
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
; GFX950-VGPRCD-GISEL-NEXT: s_endpgm
;
; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_bf8:
@@ -3264,23 +3474,23 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_fp8(ptr addrspace(1) %ar
; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, 0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v8, s8
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v9, s9
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s14
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[4:7], v[8:9], v[0:3], v11 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
; GFX942-VGPRCD-SDAG-NEXT: s_nop 6
-; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v10, v[4:7], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7]
; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_fp8:
@@ -3290,21 +3500,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_fp8(ptr addrspace(1) %ar
; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2
; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9]
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11]
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v10, s14
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14
; GFX942-VGPRCD-GISEL-NEXT: s_nop 1
-; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[4:7], v[8:9], v[0:3], v10 cbsz:1 abid:2
+; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX942-VGPRCD-GISEL-NEXT: s_nop 5
-; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[4:7], s[12:13]
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
; GFX942-VGPRCD-GISEL-NEXT: s_endpgm
;
; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_fp8:
@@ -3368,23 +3578,23 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_fp8(ptr addrspace(1) %ar
; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, 0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v8, s8
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v9, s9
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s14
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[4:7], v[8:9], v[0:3], v11 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
-; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v10, v[4:7], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7]
; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_fp8:
@@ -3394,21 +3604,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_fp8(ptr addrspace(1) %ar
; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2
; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9]
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11]
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v10, s14
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14
; GFX950-VGPRCD-GISEL-NEXT: s_nop 1
-; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[4:7], v[8:9], v[0:3], v10 cbsz:1 abid:2
+; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX950-VGPRCD-GISEL-NEXT: s_nop 6
-; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[4:7], s[12:13]
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
; GFX950-VGPRCD-GISEL-NEXT: s_endpgm
;
; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_fp8:
@@ -3480,16 +3690,16 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s16
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s17
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s18
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v17, s19
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s20
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s21
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s22
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -3498,7 +3708,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX942-VGPRCD-SDAG-NEXT: s_nop 7
; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
@@ -3632,16 +3842,16 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s16
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s17
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s18
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v17, s19
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s20
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s21
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s22
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -3650,7 +3860,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
; GFX950-VGPRCD-SDAG-NEXT: s_nop 2
@@ -3791,16 +4001,16 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s16
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s17
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s18
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v17, s19
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s20
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s21
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s22
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -3809,7 +4019,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX942-VGPRCD-SDAG-NEXT: s_nop 7
; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
@@ -3943,16 +4153,16 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s16
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s17
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s18
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v17, s19
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s20
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s21
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s22
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -3961,7 +4171,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
; GFX950-VGPRCD-SDAG-NEXT: s_nop 2
@@ -4102,16 +4312,16 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s16
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s17
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s18
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v17, s19
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s20
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s21
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s22
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -4120,7 +4330,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX942-VGPRCD-SDAG-NEXT: s_nop 7
; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
@@ -4254,16 +4464,16 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s16
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s17
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s18
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v17, s19
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s20
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s21
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s22
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -4272,7 +4482,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
; GFX950-VGPRCD-SDAG-NEXT: s_nop 2
@@ -4413,16 +4623,16 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s16
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s17
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s18
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v17, s19
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s20
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s21
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s22
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -4431,7 +4641,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX942-VGPRCD-SDAG-NEXT: s_nop 7
; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
@@ -4565,16 +4775,16 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s16
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s17
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s18
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v17, s19
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s20
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s21
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s22
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -4583,7 +4793,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
; GFX950-VGPRCD-SDAG-NEXT: s_nop 2
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
index 13e63a5a1e851..21465beb21de7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
@@ -518,9 +518,9 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; HEURRC-NEXT: v_mov_b64_e32 v[12:13], 48
-; HEURRC-NEXT: v_mov_b64_e32 v[14:15], 32
-; HEURRC-NEXT: v_mov_b64_e32 v[16:17], 16
+; HEURRC-NEXT: v_mov_b64_e32 v[8:9], 48
+; HEURRC-NEXT: v_mov_b64_e32 v[10:11], 32
+; HEURRC-NEXT: v_mov_b64_e32 v[12:13], 16
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
@@ -542,42 +542,42 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21
; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22
; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23
-; HEURRC-NEXT: v_mov_b64_e32 v[18:19], 0
-; HEURRC-NEXT: v_mov_b32_e32 v8, s16
+; HEURRC-NEXT: v_mov_b64_e32 v[14:15], 0
+; HEURRC-NEXT: v_mov_b32_e32 v16, s16
; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15]
; HEURRC-NEXT: v_mov_b32_e32 v0, s20
; HEURRC-NEXT: v_mov_b32_e32 v1, s21
; HEURRC-NEXT: v_mov_b32_e32 v2, s22
; HEURRC-NEXT: v_mov_b32_e32 v3, s23
-; HEURRC-NEXT: v_mov_b32_e32 v9, s17
-; HEURRC-NEXT: v_mov_b32_e32 v10, s18
-; HEURRC-NEXT: v_mov_b32_e32 v11, s19
+; HEURRC-NEXT: v_mov_b32_e32 v17, s17
+; HEURRC-NEXT: v_mov_b32_e32 v18, s18
+; HEURRC-NEXT: v_mov_b32_e32 v19, s19
; HEURRC-NEXT: s_nop 4
-; HEURRC-NEXT: global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[16:17], a[20:23], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[18:19], a[16:19], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[14:15], v[8:11], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
; HEURRC-NEXT: v_mov_b32_e32 v0, s8
; HEURRC-NEXT: v_mov_b32_e32 v1, s9
; HEURRC-NEXT: v_mov_b32_e32 v2, s10
; HEURRC-NEXT: v_mov_b32_e32 v3, s11
-; HEURRC-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
; HEURRC-NEXT: v_mov_b32_e32 v0, s12
; HEURRC-NEXT: v_mov_b32_e32 v1, s13
; HEURRC-NEXT: v_mov_b32_e32 v2, s14
; HEURRC-NEXT: v_mov_b32_e32 v3, s15
-; HEURRC-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_endpgm
;
@@ -585,9 +585,9 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
; VGPRRC: ; %bb.0:
; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; VGPRRC-NEXT: v_mov_b64_e32 v[44:45], 48
-; VGPRRC-NEXT: v_mov_b64_e32 v[46:47], 32
-; VGPRRC-NEXT: v_mov_b64_e32 v[48:49], 16
+; VGPRRC-NEXT: v_mov_b64_e32 v[40:41], 48
+; VGPRRC-NEXT: v_mov_b64_e32 v[42:43], 32
+; VGPRRC-NEXT: v_mov_b64_e32 v[44:45], 16
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
; VGPRRC-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
; VGPRRC-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
@@ -601,43 +601,43 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; VGPRRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; VGPRRC-NEXT: v_mov_b64_e32 v[50:51], 0
-; VGPRRC-NEXT: v_mov_b32_e32 v40, s16
+; VGPRRC-NEXT: v_mov_b64_e32 v[46:47], 0
+; VGPRRC-NEXT: v_mov_b32_e32 v48, s16
; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[32:35], v[36:39], v[0:15]
-; VGPRRC-NEXT: v_mov_b32_e32 v41, s17
-; VGPRRC-NEXT: v_mov_b32_e32 v42, s18
-; VGPRRC-NEXT: v_mov_b32_e32 v43, s19
+; VGPRRC-NEXT: v_mov_b32_e32 v49, s17
+; VGPRRC-NEXT: v_mov_b32_e32 v50, s18
+; VGPRRC-NEXT: v_mov_b32_e32 v51, s19
; VGPRRC-NEXT: s_nop 7
; VGPRRC-NEXT: s_nop 0
-; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[28:31], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[28:31], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[24:27], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[24:27], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[48:49], v[20:23], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[20:23], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[50:51], v[16:19], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[16:19], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[40:43], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[48:51], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: v_mov_b32_e32 v0, s20
; VGPRRC-NEXT: v_mov_b32_e32 v1, s21
; VGPRRC-NEXT: v_mov_b32_e32 v2, s22
; VGPRRC-NEXT: v_mov_b32_e32 v3, s23
-; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v0, s8
; VGPRRC-NEXT: v_mov_b32_e32 v1, s9
; VGPRRC-NEXT: v_mov_b32_e32 v2, s10
; VGPRRC-NEXT: v_mov_b32_e32 v3, s11
-; VGPRRC-NEXT: global_store_dwordx4 v[50:51], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v0, s12
; VGPRRC-NEXT: v_mov_b32_e32 v1, s13
; VGPRRC-NEXT: v_mov_b32_e32 v2, s14
; VGPRRC-NEXT: v_mov_b32_e32 v3, s15
-; VGPRRC-NEXT: global_store_dwordx4 v[48:49], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_f32_32x32x16_f16:
@@ -900,9 +900,9 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; HEURRC-NEXT: v_mov_b64_e32 v[12:13], 48
-; HEURRC-NEXT: v_mov_b64_e32 v[14:15], 32
-; HEURRC-NEXT: v_mov_b64_e32 v[16:17], 16
+; HEURRC-NEXT: v_mov_b64_e32 v[8:9], 48
+; HEURRC-NEXT: v_mov_b64_e32 v[10:11], 32
+; HEURRC-NEXT: v_mov_b64_e32 v[12:13], 16
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
@@ -924,42 +924,42 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21
; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22
; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23
-; HEURRC-NEXT: v_mov_b64_e32 v[18:19], 0
-; HEURRC-NEXT: v_mov_b32_e32 v8, s16
+; HEURRC-NEXT: v_mov_b64_e32 v[14:15], 0
+; HEURRC-NEXT: v_mov_b32_e32 v16, s16
; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1
; HEURRC-NEXT: v_mov_b32_e32 v0, s20
; HEURRC-NEXT: v_mov_b32_e32 v1, s21
; HEURRC-NEXT: v_mov_b32_e32 v2, s22
; HEURRC-NEXT: v_mov_b32_e32 v3, s23
-; HEURRC-NEXT: v_mov_b32_e32 v9, s17
-; HEURRC-NEXT: v_mov_b32_e32 v10, s18
-; HEURRC-NEXT: v_mov_b32_e32 v11, s19
+; HEURRC-NEXT: v_mov_b32_e32 v17, s17
+; HEURRC-NEXT: v_mov_b32_e32 v18, s18
+; HEURRC-NEXT: v_mov_b32_e32 v19, s19
; HEURRC-NEXT: s_nop 4
-; HEURRC-NEXT: global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[16:17], a[20:23], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[18:19], a[16:19], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[14:15], v[8:11], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
; HEURRC-NEXT: v_mov_b32_e32 v0, s8
; HEURRC-NEXT: v_mov_b32_e32 v1, s9
; HEURRC-NEXT: v_mov_b32_e32 v2, s10
; HEURRC-NEXT: v_mov_b32_e32 v3, s11
-; HEURRC-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
; HEURRC-NEXT: v_mov_b32_e32 v0, s12
; HEURRC-NEXT: v_mov_b32_e32 v1, s13
; HEURRC-NEXT: v_mov_b32_e32 v2, s14
; HEURRC-NEXT: v_mov_b32_e32 v3, s15
-; HEURRC-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_endpgm
;
@@ -967,9 +967,9 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
; VGPRRC: ; %bb.0:
; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; VGPRRC-NEXT: v_mov_b64_e32 v[44:45], 48
-; VGPRRC-NEXT: v_mov_b64_e32 v[46:47], 32
-; VGPRRC-NEXT: v_mov_b64_e32 v[48:49], 16
+; VGPRRC-NEXT: v_mov_b64_e32 v[40:41], 48
+; VGPRRC-NEXT: v_mov_b64_e32 v[42:43], 32
+; VGPRRC-NEXT: v_mov_b64_e32 v[44:45], 16
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
; VGPRRC-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
; VGPRRC-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
@@ -983,43 +983,43 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; VGPRRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; VGPRRC-NEXT: v_mov_b64_e32 v[50:51], 0
-; VGPRRC-NEXT: v_mov_b32_e32 v40, s16
+; VGPRRC-NEXT: v_mov_b64_e32 v[46:47], 0
+; VGPRRC-NEXT: v_mov_b32_e32 v48, s16
; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[32:35], v[36:39], v[0:15] cbsz:2 abid:3 blgp:1
-; VGPRRC-NEXT: v_mov_b32_e32 v41, s17
-; VGPRRC-NEXT: v_mov_b32_e32 v42, s18
-; VGPRRC-NEXT: v_mov_b32_e32 v43, s19
+; VGPRRC-NEXT: v_mov_b32_e32 v49, s17
+; VGPRRC-NEXT: v_mov_b32_e32 v50, s18
+; VGPRRC-NEXT: v_mov_b32_e32 v51, s19
; VGPRRC-NEXT: s_nop 7
; VGPRRC-NEXT: s_nop 0
-; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[28:31], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[28:31], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[24:27], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[24:27], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[48:49], v[20:23], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[20:23], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[50:51], v[16:19], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[16:19], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[40:43], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[48:51], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: v_mov_b32_e32 v0, s20
; VGPRRC-NEXT: v_mov_b32_e32 v1, s21
; VGPRRC-NEXT: v_mov_b32_e32 v2, s22
; VGPRRC-NEXT: v_mov_b32_e32 v3, s23
-; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v0, s8
; VGPRRC-NEXT: v_mov_b32_e32 v1, s9
; VGPRRC-NEXT: v_mov_b32_e32 v2, s10
; VGPRRC-NEXT: v_mov_b32_e32 v3, s11
-; VGPRRC-NEXT: global_store_dwordx4 v[50:51], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v0, s12
; VGPRRC-NEXT: v_mov_b32_e32 v1, s13
; VGPRRC-NEXT: v_mov_b32_e32 v2, s14
; VGPRRC-NEXT: v_mov_b32_e32 v3, s15
-; VGPRRC-NEXT: global_store_dwordx4 v[48:49], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_f32_32x32x16_f16__flags:
@@ -1623,7 +1623,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0,
; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; HEURRC-NEXT: v_mov_b32_e32 v12, 0
+; HEURRC-NEXT: v_mov_b32_e32 v8, 0
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
@@ -1645,40 +1645,40 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0,
; HEURRC-NEXT: v_accvgpr_write_b32 a18, s10
; HEURRC-NEXT: v_accvgpr_write_b32 a17, s9
; HEURRC-NEXT: v_accvgpr_write_b32 a16, s8
-; HEURRC-NEXT: v_mov_b32_e32 v8, s20
-; HEURRC-NEXT: v_mov_b32_e32 v9, s21
+; HEURRC-NEXT: v_mov_b32_e32 v10, s20
+; HEURRC-NEXT: v_mov_b32_e32 v11, s21
; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[16:31]
-; HEURRC-NEXT: v_mov_b32_e32 v10, s22
-; HEURRC-NEXT: v_mov_b32_e32 v11, s23
+; HEURRC-NEXT: v_mov_b32_e32 v12, s22
+; HEURRC-NEXT: v_mov_b32_e32 v13, s23
; HEURRC-NEXT: v_mov_b32_e32 v0, s16
; HEURRC-NEXT: v_mov_b32_e32 v1, s17
; HEURRC-NEXT: v_mov_b32_e32 v2, s18
; HEURRC-NEXT: v_mov_b32_e32 v3, s19
-; HEURRC-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:48 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
; HEURRC-NEXT: v_mov_b32_e32 v0, s12
; HEURRC-NEXT: v_mov_b32_e32 v1, s13
; HEURRC-NEXT: v_mov_b32_e32 v2, s14
; HEURRC-NEXT: v_mov_b32_e32 v3, s15
-; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
; HEURRC-NEXT: v_mov_b32_e32 v0, s8
; HEURRC-NEXT: v_mov_b32_e32 v1, s9
; HEURRC-NEXT: v_mov_b32_e32 v2, s10
; HEURRC-NEXT: v_mov_b32_e32 v3, s11
-; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v12, a[8:11], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v12, a[12:15], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v12, a[0:3], s[0:1] sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v12, a[4:7], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_endpgm
;
@@ -1687,7 +1687,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0,
; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; VGPRRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; VGPRRC-NEXT: v_mov_b32_e32 v44, 0
+; VGPRRC-NEXT: v_mov_b32_e32 v40, 0
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
; VGPRRC-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
; VGPRRC-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
@@ -1701,41 +1701,41 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0,
; VGPRRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
; VGPRRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
-; VGPRRC-NEXT: v_mov_b32_e32 v40, s20
-; VGPRRC-NEXT: v_mov_b32_e32 v41, s21
+; VGPRRC-NEXT: v_mov_b32_e32 v42, s20
+; VGPRRC-NEXT: v_mov_b32_e32 v43, s21
; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31]
-; VGPRRC-NEXT: v_mov_b32_e32 v42, s22
-; VGPRRC-NEXT: v_mov_b32_e32 v43, s23
-; VGPRRC-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
+; VGPRRC-NEXT: v_mov_b32_e32 v44, s22
+; VGPRRC-NEXT: v_mov_b32_e32 v45, s23
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[42:45], s[0:1] offset:48 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 2
; VGPRRC-NEXT: v_mov_b32_e32 v16, s16
; VGPRRC-NEXT: v_mov_b32_e32 v17, s17
; VGPRRC-NEXT: v_mov_b32_e32 v18, s18
; VGPRRC-NEXT: v_mov_b32_e32 v19, s19
-; VGPRRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v16, s12
; VGPRRC-NEXT: v_mov_b32_e32 v17, s13
; VGPRRC-NEXT: v_mov_b32_e32 v18, s14
; VGPRRC-NEXT: v_mov_b32_e32 v19, s15
-; VGPRRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v16, s8
; VGPRRC-NEXT: v_mov_b32_e32 v17, s9
; VGPRRC-NEXT: v_mov_b32_e32 v18, s10
; VGPRRC-NEXT: v_mov_b32_e32 v19, s11
-; VGPRRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_f32_32x32x16_f16__vgprcd:
@@ -1987,7 +1987,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half>
; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; HEURRC-NEXT: v_mov_b32_e32 v12, 0
+; HEURRC-NEXT: v_mov_b32_e32 v8, 0
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
@@ -2009,40 +2009,40 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half>
; HEURRC-NEXT: v_accvgpr_write_b32 a18, s10
; HEURRC-NEXT: v_accvgpr_write_b32 a17, s9
; HEURRC-NEXT: v_accvgpr_write_b32 a16, s8
-; HEURRC-NEXT: v_mov_b32_e32 v8, s20
-; HEURRC-NEXT: v_mov_b32_e32 v9, s21
+; HEURRC-NEXT: v_mov_b32_e32 v10, s20
+; HEURRC-NEXT: v_mov_b32_e32 v11, s21
; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[16:31] cbsz:1 abid:2 blgp:3
-; HEURRC-NEXT: v_mov_b32_e32 v10, s22
-; HEURRC-NEXT: v_mov_b32_e32 v11, s23
+; HEURRC-NEXT: v_mov_b32_e32 v12, s22
+; HEURRC-NEXT: v_mov_b32_e32 v13, s23
; HEURRC-NEXT: v_mov_b32_e32 v0, s16
; HEURRC-NEXT: v_mov_b32_e32 v1, s17
; HEURRC-NEXT: v_mov_b32_e32 v2, s18
; HEURRC-NEXT: v_mov_b32_e32 v3, s19
-; HEURRC-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:48 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
; HEURRC-NEXT: v_mov_b32_e32 v0, s12
; HEURRC-NEXT: v_mov_b32_e32 v1, s13
; HEURRC-NEXT: v_mov_b32_e32 v2, s14
; HEURRC-NEXT: v_mov_b32_e32 v3, s15
-; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
; HEURRC-NEXT: v_mov_b32_e32 v0, s8
; HEURRC-NEXT: v_mov_b32_e32 v1, s9
; HEURRC-NEXT: v_mov_b32_e32 v2, s10
; HEURRC-NEXT: v_mov_b32_e32 v3, s11
-; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v12, a[8:11], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v12, a[12:15], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v12, a[0:3], s[0:1] sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v12, a[4:7], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_endpgm
;
@@ -2051,7 +2051,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half>
; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; VGPRRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; VGPRRC-NEXT: v_mov_b32_e32 v44, 0
+; VGPRRC-NEXT: v_mov_b32_e32 v40, 0
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
; VGPRRC-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
; VGPRRC-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
@@ -2065,41 +2065,41 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half>
; VGPRRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
; VGPRRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
-; VGPRRC-NEXT: v_mov_b32_e32 v40, s20
-; VGPRRC-NEXT: v_mov_b32_e32 v41, s21
+; VGPRRC-NEXT: v_mov_b32_e32 v42, s20
+; VGPRRC-NEXT: v_mov_b32_e32 v43, s21
; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
-; VGPRRC-NEXT: v_mov_b32_e32 v42, s22
-; VGPRRC-NEXT: v_mov_b32_e32 v43, s23
-; VGPRRC-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
+; VGPRRC-NEXT: v_mov_b32_e32 v44, s22
+; VGPRRC-NEXT: v_mov_b32_e32 v45, s23
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[42:45], s[0:1] offset:48 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 2
; VGPRRC-NEXT: v_mov_b32_e32 v16, s16
; VGPRRC-NEXT: v_mov_b32_e32 v17, s17
; VGPRRC-NEXT: v_mov_b32_e32 v18, s18
; VGPRRC-NEXT: v_mov_b32_e32 v19, s19
-; VGPRRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v16, s12
; VGPRRC-NEXT: v_mov_b32_e32 v17, s13
; VGPRRC-NEXT: v_mov_b32_e32 v18, s14
; VGPRRC-NEXT: v_mov_b32_e32 v19, s15
-; VGPRRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v16, s8
; VGPRRC-NEXT: v_mov_b32_e32 v17, s9
; VGPRRC-NEXT: v_mov_b32_e32 v18, s10
; VGPRRC-NEXT: v_mov_b32_e32 v19, s11
-; VGPRRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_f32_32x32x16_f16__vgprcd__flags:
@@ -2827,24 +2827,24 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa
; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; HEURRC-NEXT: v_mov_b32_e32 v8, 0
+; HEURRC-NEXT: v_mov_b32_e32 v0, 0
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b32_e32 v0, s8
-; HEURRC-NEXT: v_mov_b32_e32 v1, s9
-; HEURRC-NEXT: v_mov_b32_e32 v2, s10
-; HEURRC-NEXT: v_mov_b32_e32 v3, s11
-; HEURRC-NEXT: v_mov_b32_e32 v4, s12
-; HEURRC-NEXT: v_mov_b32_e32 v5, s13
-; HEURRC-NEXT: v_mov_b32_e32 v6, s14
-; HEURRC-NEXT: v_mov_b32_e32 v7, s15
+; HEURRC-NEXT: v_mov_b32_e32 v2, s8
+; HEURRC-NEXT: v_mov_b32_e32 v3, s9
+; HEURRC-NEXT: v_mov_b32_e32 v4, s10
+; HEURRC-NEXT: v_mov_b32_e32 v5, s11
+; HEURRC-NEXT: v_mov_b32_e32 v6, s12
+; HEURRC-NEXT: v_mov_b32_e32 v7, s13
+; HEURRC-NEXT: v_mov_b32_e32 v8, s14
+; HEURRC-NEXT: v_mov_b32_e32 v9, s15
; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0
; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1
; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2
; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3]
+; HEURRC-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[2:5], v[6:9], a[0:3]
; HEURRC-NEXT: s_nop 7
-; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
; HEURRC-NEXT: s_endpgm
;
; VGPRRC-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd:
@@ -2852,24 +2852,24 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa
; VGPRRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; VGPRRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; VGPRRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; VGPRRC-NEXT: v_mov_b32_e32 v12, 0
+; VGPRRC-NEXT: v_mov_b32_e32 v4, 0
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
-; VGPRRC-NEXT: v_mov_b32_e32 v0, s8
-; VGPRRC-NEXT: v_mov_b32_e32 v1, s9
-; VGPRRC-NEXT: v_mov_b32_e32 v2, s10
-; VGPRRC-NEXT: v_mov_b32_e32 v3, s11
-; VGPRRC-NEXT: v_mov_b32_e32 v4, s12
-; VGPRRC-NEXT: v_mov_b32_e32 v5, s13
-; VGPRRC-NEXT: v_mov_b32_e32 v6, s14
-; VGPRRC-NEXT: v_mov_b32_e32 v7, s15
-; VGPRRC-NEXT: v_mov_b32_e32 v8, s0
-; VGPRRC-NEXT: v_mov_b32_e32 v9, s1
-; VGPRRC-NEXT: v_mov_b32_e32 v10, s2
-; VGPRRC-NEXT: v_mov_b32_e32 v11, s3
+; VGPRRC-NEXT: v_mov_b32_e32 v6, s8
+; VGPRRC-NEXT: v_mov_b32_e32 v7, s9
+; VGPRRC-NEXT: v_mov_b32_e32 v8, s10
+; VGPRRC-NEXT: v_mov_b32_e32 v9, s11
+; VGPRRC-NEXT: v_mov_b32_e32 v10, s12
+; VGPRRC-NEXT: v_mov_b32_e32 v11, s13
+; VGPRRC-NEXT: v_mov_b32_e32 v12, s14
+; VGPRRC-NEXT: v_mov_b32_e32 v13, s15
+; VGPRRC-NEXT: v_mov_b32_e32 v0, s0
+; VGPRRC-NEXT: v_mov_b32_e32 v1, s1
+; VGPRRC-NEXT: v_mov_b32_e32 v2, s2
+; VGPRRC-NEXT: v_mov_b32_e32 v3, s3
; VGPRRC-NEXT: s_nop 1
-; VGPRRC-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11]
+; VGPRRC-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[6:9], v[10:13], v[0:3]
; VGPRRC-NEXT: s_nop 7
-; VGPRRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
+; VGPRRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd:
; AGPR: ; %bb.0:
@@ -2976,24 +2976,24 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr
; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; HEURRC-NEXT: v_mov_b32_e32 v8, 0
+; HEURRC-NEXT: v_mov_b32_e32 v0, 0
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b32_e32 v0, s8
-; HEURRC-NEXT: v_mov_b32_e32 v1, s9
-; HEURRC-NEXT: v_mov_b32_e32 v2, s10
-; HEURRC-NEXT: v_mov_b32_e32 v3, s11
-; HEURRC-NEXT: v_mov_b32_e32 v4, s12
-; HEURRC-NEXT: v_mov_b32_e32 v5, s13
-; HEURRC-NEXT: v_mov_b32_e32 v6, s14
-; HEURRC-NEXT: v_mov_b32_e32 v7, s15
+; HEURRC-NEXT: v_mov_b32_e32 v2, s8
+; HEURRC-NEXT: v_mov_b32_e32 v3, s9
+; HEURRC-NEXT: v_mov_b32_e32 v4, s10
+; HEURRC-NEXT: v_mov_b32_e32 v5, s11
+; HEURRC-NEXT: v_mov_b32_e32 v6, s12
+; HEURRC-NEXT: v_mov_b32_e32 v7, s13
+; HEURRC-NEXT: v_mov_b32_e32 v8, s14
+; HEURRC-NEXT: v_mov_b32_e32 v9, s15
; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0
; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1
; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2
; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
+; HEURRC-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[2:5], v[6:9], a[0:3] cbsz:3 abid:2 blgp:1
; HEURRC-NEXT: s_nop 7
-; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
; HEURRC-NEXT: s_endpgm
;
; VGPRRC-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags:
@@ -3001,24 +3001,24 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr
; VGPRRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; VGPRRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; VGPRRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; VGPRRC-NEXT: v_mov_b32_e32 v12, 0
+; VGPRRC-NEXT: v_mov_b32_e32 v4, 0
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
-; VGPRRC-NEXT: v_mov_b32_e32 v0, s8
-; VGPRRC-NEXT: v_mov_b32_e32 v1, s9
-; VGPRRC-NEXT: v_mov_b32_e32 v2, s10
-; VGPRRC-NEXT: v_mov_b32_e32 v3, s11
-; VGPRRC-NEXT: v_mov_b32_e32 v4, s12
-; VGPRRC-NEXT: v_mov_b32_e32 v5, s13
-; VGPRRC-NEXT: v_mov_b32_e32 v6, s14
-; VGPRRC-NEXT: v_mov_b32_e32 v7, s15
-; VGPRRC-NEXT: v_mov_b32_e32 v8, s0
-; VGPRRC-NEXT: v_mov_b32_e32 v9, s1
-; VGPRRC-NEXT: v_mov_b32_e32 v10, s2
-; VGPRRC-NEXT: v_mov_b32_e32 v11, s3
+; VGPRRC-NEXT: v_mov_b32_e32 v6, s8
+; VGPRRC-NEXT: v_mov_b32_e32 v7, s9
+; VGPRRC-NEXT: v_mov_b32_e32 v8, s10
+; VGPRRC-NEXT: v_mov_b32_e32 v9, s11
+; VGPRRC-NEXT: v_mov_b32_e32 v10, s12
+; VGPRRC-NEXT: v_mov_b32_e32 v11, s13
+; VGPRRC-NEXT: v_mov_b32_e32 v12, s14
+; VGPRRC-NEXT: v_mov_b32_e32 v13, s15
+; VGPRRC-NEXT: v_mov_b32_e32 v0, s0
+; VGPRRC-NEXT: v_mov_b32_e32 v1, s1
+; VGPRRC-NEXT: v_mov_b32_e32 v2, s2
+; VGPRRC-NEXT: v_mov_b32_e32 v3, s3
; VGPRRC-NEXT: s_nop 1
-; VGPRRC-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
+; VGPRRC-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1
; VGPRRC-NEXT: s_nop 7
-; VGPRRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
+; VGPRRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags:
; AGPR: ; %bb.0:
@@ -3212,19 +3212,19 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; HEURRC-NEXT: v_mov_b64_e32 v[8:9], 48
-; HEURRC-NEXT: v_mov_b64_e32 v[10:11], 32
-; HEURRC-NEXT: v_mov_b64_e32 v[12:13], 16
+; HEURRC-NEXT: v_mov_b64_e32 v[0:1], 48
+; HEURRC-NEXT: v_mov_b64_e32 v[2:3], 32
+; HEURRC-NEXT: v_mov_b64_e32 v[4:5], 16
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b32_e32 v0, s24
-; HEURRC-NEXT: v_mov_b32_e32 v1, s25
-; HEURRC-NEXT: v_mov_b32_e32 v2, s26
-; HEURRC-NEXT: v_mov_b32_e32 v3, s27
+; HEURRC-NEXT: v_mov_b32_e32 v8, s24
+; HEURRC-NEXT: v_mov_b32_e32 v9, s25
+; HEURRC-NEXT: v_mov_b32_e32 v10, s26
+; HEURRC-NEXT: v_mov_b32_e32 v11, s27
; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8
-; HEURRC-NEXT: v_mov_b32_e32 v4, s28
-; HEURRC-NEXT: v_mov_b32_e32 v5, s29
-; HEURRC-NEXT: v_mov_b32_e32 v6, s30
-; HEURRC-NEXT: v_mov_b32_e32 v7, s31
+; HEURRC-NEXT: v_mov_b32_e32 v12, s28
+; HEURRC-NEXT: v_mov_b32_e32 v13, s29
+; HEURRC-NEXT: v_mov_b32_e32 v14, s30
+; HEURRC-NEXT: v_mov_b32_e32 v15, s31
; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9
; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10
; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11
@@ -3240,44 +3240,42 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21
; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22
; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23
-; HEURRC-NEXT: v_mov_b64_e32 v[14:15], 0
+; HEURRC-NEXT: v_mov_b64_e32 v[6:7], 0
; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15]
-; HEURRC-NEXT: v_mov_b32_e32 v0, s16
-; HEURRC-NEXT: v_mov_b32_e32 v1, s17
-; HEURRC-NEXT: v_mov_b32_e32 v2, s18
-; HEURRC-NEXT: v_mov_b32_e32 v3, s19
+; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[8:11], v[12:15], a[0:15]
+; HEURRC-NEXT: v_mov_b32_e32 v8, s16
+; HEURRC-NEXT: v_mov_b32_e32 v9, s17
+; HEURRC-NEXT: v_mov_b32_e32 v10, s18
+; HEURRC-NEXT: v_mov_b32_e32 v11, s19
; HEURRC-NEXT: s_nop 7
-; HEURRC-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[0:1], a[28:31], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[2:3], a[24:27], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[4:5], a[20:23], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[6:7], a[16:19], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[2:3], v[8:11], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mov_b32_e32 v0, s20
-; HEURRC-NEXT: v_mov_b32_e32 v1, s21
-; HEURRC-NEXT: v_mov_b32_e32 v2, s22
-; HEURRC-NEXT: v_mov_b32_e32 v3, s23
-; HEURRC-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; HEURRC-NEXT: v_mov_b32_e32 v2, s10
+; HEURRC-NEXT: v_mov_b32_e32 v3, s11
+; HEURRC-NEXT: v_mov_b32_e32 v8, s20
+; HEURRC-NEXT: v_mov_b32_e32 v9, s21
+; HEURRC-NEXT: v_mov_b32_e32 v10, s22
+; HEURRC-NEXT: v_mov_b32_e32 v11, s23
+; HEURRC-NEXT: global_store_dwordx4 v[0:1], v[8:11], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: s_nop 0
; HEURRC-NEXT: v_mov_b32_e32 v0, s8
; HEURRC-NEXT: v_mov_b32_e32 v1, s9
-; HEURRC-NEXT: v_mov_b32_e32 v2, s10
-; HEURRC-NEXT: v_mov_b32_e32 v3, s11
-; HEURRC-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
; HEURRC-NEXT: v_mov_b32_e32 v0, s12
; HEURRC-NEXT: v_mov_b32_e32 v1, s13
; HEURRC-NEXT: v_mov_b32_e32 v2, s14
; HEURRC-NEXT: v_mov_b32_e32 v3, s15
-; HEURRC-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_endpgm
;
@@ -3285,19 +3283,19 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
; VGPRRC: ; %bb.0:
; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; VGPRRC-NEXT: v_mov_b64_e32 v[40:41], 48
-; VGPRRC-NEXT: v_mov_b64_e32 v[42:43], 32
-; VGPRRC-NEXT: v_mov_b64_e32 v[44:45], 16
+; VGPRRC-NEXT: v_mov_b64_e32 v[32:33], 48
+; VGPRRC-NEXT: v_mov_b64_e32 v[34:35], 32
+; VGPRRC-NEXT: v_mov_b64_e32 v[36:37], 16
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
-; VGPRRC-NEXT: v_mov_b32_e32 v32, s24
-; VGPRRC-NEXT: v_mov_b32_e32 v33, s25
-; VGPRRC-NEXT: v_mov_b32_e32 v34, s26
-; VGPRRC-NEXT: v_mov_b32_e32 v35, s27
+; VGPRRC-NEXT: v_mov_b32_e32 v40, s24
+; VGPRRC-NEXT: v_mov_b32_e32 v41, s25
+; VGPRRC-NEXT: v_mov_b32_e32 v42, s26
+; VGPRRC-NEXT: v_mov_b32_e32 v43, s27
; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
-; VGPRRC-NEXT: v_mov_b32_e32 v36, s28
-; VGPRRC-NEXT: v_mov_b32_e32 v37, s29
-; VGPRRC-NEXT: v_mov_b32_e32 v38, s30
-; VGPRRC-NEXT: v_mov_b32_e32 v39, s31
+; VGPRRC-NEXT: v_mov_b32_e32 v44, s28
+; VGPRRC-NEXT: v_mov_b32_e32 v45, s29
+; VGPRRC-NEXT: v_mov_b32_e32 v46, s30
+; VGPRRC-NEXT: v_mov_b32_e32 v47, s31
; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
@@ -3305,45 +3303,45 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; VGPRRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; VGPRRC-NEXT: v_mov_b64_e32 v[46:47], 0
+; VGPRRC-NEXT: v_mov_b64_e32 v[38:39], 0
; VGPRRC-NEXT: s_nop 0
-; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[16:31], v[32:35], v[36:39], v[0:15]
+; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[16:31], v[40:43], v[44:47], v[0:15]
; VGPRRC-NEXT: s_nop 7
; VGPRRC-NEXT: s_nop 3
-; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[28:31], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[32:33], v[28:31], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[24:27], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[34:35], v[24:27], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[20:23], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[36:37], v[20:23], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[16:19], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[38:39], v[16:19], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: v_mov_b32_e32 v0, s16
; VGPRRC-NEXT: v_mov_b32_e32 v1, s17
; VGPRRC-NEXT: v_mov_b32_e32 v2, s18
; VGPRRC-NEXT: v_mov_b32_e32 v3, s19
-; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[34:35], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v0, s20
; VGPRRC-NEXT: v_mov_b32_e32 v1, s21
; VGPRRC-NEXT: v_mov_b32_e32 v2, s22
; VGPRRC-NEXT: v_mov_b32_e32 v3, s23
-; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v0, s8
; VGPRRC-NEXT: v_mov_b32_e32 v1, s9
; VGPRRC-NEXT: v_mov_b32_e32 v2, s10
; VGPRRC-NEXT: v_mov_b32_e32 v3, s11
-; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[38:39], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v0, s12
; VGPRRC-NEXT: v_mov_b32_e32 v1, s13
; VGPRRC-NEXT: v_mov_b32_e32 v2, s14
; VGPRRC-NEXT: v_mov_b32_e32 v3, s15
-; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[36:37], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_i32_32x32x32_i8:
@@ -3622,19 +3620,19 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; HEURRC-NEXT: v_mov_b64_e32 v[8:9], 48
-; HEURRC-NEXT: v_mov_b64_e32 v[10:11], 32
-; HEURRC-NEXT: v_mov_b64_e32 v[12:13], 16
+; HEURRC-NEXT: v_mov_b64_e32 v[0:1], 48
+; HEURRC-NEXT: v_mov_b64_e32 v[2:3], 32
+; HEURRC-NEXT: v_mov_b64_e32 v[4:5], 16
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b32_e32 v0, s24
-; HEURRC-NEXT: v_mov_b32_e32 v1, s25
-; HEURRC-NEXT: v_mov_b32_e32 v2, s26
-; HEURRC-NEXT: v_mov_b32_e32 v3, s27
+; HEURRC-NEXT: v_mov_b32_e32 v8, s24
+; HEURRC-NEXT: v_mov_b32_e32 v9, s25
+; HEURRC-NEXT: v_mov_b32_e32 v10, s26
+; HEURRC-NEXT: v_mov_b32_e32 v11, s27
; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8
-; HEURRC-NEXT: v_mov_b32_e32 v4, s28
-; HEURRC-NEXT: v_mov_b32_e32 v5, s29
-; HEURRC-NEXT: v_mov_b32_e32 v6, s30
-; HEURRC-NEXT: v_mov_b32_e32 v7, s31
+; HEURRC-NEXT: v_mov_b32_e32 v12, s28
+; HEURRC-NEXT: v_mov_b32_e32 v13, s29
+; HEURRC-NEXT: v_mov_b32_e32 v14, s30
+; HEURRC-NEXT: v_mov_b32_e32 v15, s31
; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9
; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10
; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11
@@ -3650,44 +3648,42 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21
; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22
; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23
-; HEURRC-NEXT: v_mov_b64_e32 v[14:15], 0
+; HEURRC-NEXT: v_mov_b64_e32 v[6:7], 0
; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1
-; HEURRC-NEXT: v_mov_b32_e32 v0, s16
-; HEURRC-NEXT: v_mov_b32_e32 v1, s17
-; HEURRC-NEXT: v_mov_b32_e32 v2, s18
-; HEURRC-NEXT: v_mov_b32_e32 v3, s19
+; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[8:11], v[12:15], a[0:15] cbsz:2 abid:3 blgp:1
+; HEURRC-NEXT: v_mov_b32_e32 v8, s16
+; HEURRC-NEXT: v_mov_b32_e32 v9, s17
+; HEURRC-NEXT: v_mov_b32_e32 v10, s18
+; HEURRC-NEXT: v_mov_b32_e32 v11, s19
; HEURRC-NEXT: s_nop 7
-; HEURRC-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[0:1], a[28:31], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[2:3], a[24:27], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[4:5], a[20:23], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[6:7], a[16:19], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[2:3], v[8:11], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mov_b32_e32 v0, s20
-; HEURRC-NEXT: v_mov_b32_e32 v1, s21
-; HEURRC-NEXT: v_mov_b32_e32 v2, s22
-; HEURRC-NEXT: v_mov_b32_e32 v3, s23
-; HEURRC-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; HEURRC-NEXT: v_mov_b32_e32 v2, s10
+; HEURRC-NEXT: v_mov_b32_e32 v3, s11
+; HEURRC-NEXT: v_mov_b32_e32 v8, s20
+; HEURRC-NEXT: v_mov_b32_e32 v9, s21
+; HEURRC-NEXT: v_mov_b32_e32 v10, s22
+; HEURRC-NEXT: v_mov_b32_e32 v11, s23
+; HEURRC-NEXT: global_store_dwordx4 v[0:1], v[8:11], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: s_nop 0
; HEURRC-NEXT: v_mov_b32_e32 v0, s8
; HEURRC-NEXT: v_mov_b32_e32 v1, s9
-; HEURRC-NEXT: v_mov_b32_e32 v2, s10
-; HEURRC-NEXT: v_mov_b32_e32 v3, s11
-; HEURRC-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
; HEURRC-NEXT: v_mov_b32_e32 v0, s12
; HEURRC-NEXT: v_mov_b32_e32 v1, s13
; HEURRC-NEXT: v_mov_b32_e32 v2, s14
; HEURRC-NEXT: v_mov_b32_e32 v3, s15
-; HEURRC-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_endpgm
;
@@ -3695,19 +3691,19 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
; VGPRRC: ; %bb.0:
; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; VGPRRC-NEXT: v_mov_b64_e32 v[40:41], 48
-; VGPRRC-NEXT: v_mov_b64_e32 v[42:43], 32
-; VGPRRC-NEXT: v_mov_b64_e32 v[44:45], 16
+; VGPRRC-NEXT: v_mov_b64_e32 v[32:33], 48
+; VGPRRC-NEXT: v_mov_b64_e32 v[34:35], 32
+; VGPRRC-NEXT: v_mov_b64_e32 v[36:37], 16
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
-; VGPRRC-NEXT: v_mov_b32_e32 v32, s24
-; VGPRRC-NEXT: v_mov_b32_e32 v33, s25
-; VGPRRC-NEXT: v_mov_b32_e32 v34, s26
-; VGPRRC-NEXT: v_mov_b32_e32 v35, s27
+; VGPRRC-NEXT: v_mov_b32_e32 v40, s24
+; VGPRRC-NEXT: v_mov_b32_e32 v41, s25
+; VGPRRC-NEXT: v_mov_b32_e32 v42, s26
+; VGPRRC-NEXT: v_mov_b32_e32 v43, s27
; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
-; VGPRRC-NEXT: v_mov_b32_e32 v36, s28
-; VGPRRC-NEXT: v_mov_b32_e32 v37, s29
-; VGPRRC-NEXT: v_mov_b32_e32 v38, s30
-; VGPRRC-NEXT: v_mov_b32_e32 v39, s31
+; VGPRRC-NEXT: v_mov_b32_e32 v44, s28
+; VGPRRC-NEXT: v_mov_b32_e32 v45, s29
+; VGPRRC-NEXT: v_mov_b32_e32 v46, s30
+; VGPRRC-NEXT: v_mov_b32_e32 v47, s31
; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
@@ -3715,45 +3711,45 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; VGPRRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; VGPRRC-NEXT: v_mov_b64_e32 v[46:47], 0
+; VGPRRC-NEXT: v_mov_b64_e32 v[38:39], 0
; VGPRRC-NEXT: s_nop 0
-; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[16:31], v[32:35], v[36:39], v[0:15] cbsz:2 abid:3 blgp:1
+; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[16:31], v[40:43], v[44:47], v[0:15] cbsz:2 abid:3 blgp:1
; VGPRRC-NEXT: s_nop 7
; VGPRRC-NEXT: s_nop 3
-; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[28:31], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[32:33], v[28:31], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[24:27], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[34:35], v[24:27], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[20:23], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[36:37], v[20:23], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[16:19], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[38:39], v[16:19], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: v_mov_b32_e32 v0, s16
; VGPRRC-NEXT: v_mov_b32_e32 v1, s17
; VGPRRC-NEXT: v_mov_b32_e32 v2, s18
; VGPRRC-NEXT: v_mov_b32_e32 v3, s19
-; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[34:35], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v0, s20
; VGPRRC-NEXT: v_mov_b32_e32 v1, s21
; VGPRRC-NEXT: v_mov_b32_e32 v2, s22
; VGPRRC-NEXT: v_mov_b32_e32 v3, s23
-; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v0, s8
; VGPRRC-NEXT: v_mov_b32_e32 v1, s9
; VGPRRC-NEXT: v_mov_b32_e32 v2, s10
; VGPRRC-NEXT: v_mov_b32_e32 v3, s11
-; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[38:39], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v0, s12
; VGPRRC-NEXT: v_mov_b32_e32 v1, s13
; VGPRRC-NEXT: v_mov_b32_e32 v2, s14
; VGPRRC-NEXT: v_mov_b32_e32 v3, s15
-; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[36:37], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_i32_32x32x32_i8__flags:
@@ -4375,17 +4371,17 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; HEURRC-NEXT: v_mov_b32_e32 v8, 0
+; HEURRC-NEXT: v_mov_b32_e32 v0, 0
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b32_e32 v0, s20
-; HEURRC-NEXT: v_mov_b32_e32 v1, s21
-; HEURRC-NEXT: v_mov_b32_e32 v2, s22
-; HEURRC-NEXT: v_mov_b32_e32 v3, s23
+; HEURRC-NEXT: v_mov_b32_e32 v2, s20
+; HEURRC-NEXT: v_mov_b32_e32 v3, s21
+; HEURRC-NEXT: v_mov_b32_e32 v4, s22
+; HEURRC-NEXT: v_mov_b32_e32 v5, s23
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; HEURRC-NEXT: v_mov_b32_e32 v4, s24
-; HEURRC-NEXT: v_mov_b32_e32 v5, s25
-; HEURRC-NEXT: v_mov_b32_e32 v6, s26
-; HEURRC-NEXT: v_mov_b32_e32 v7, s27
+; HEURRC-NEXT: v_mov_b32_e32 v6, s24
+; HEURRC-NEXT: v_mov_b32_e32 v7, s25
+; HEURRC-NEXT: v_mov_b32_e32 v8, s26
+; HEURRC-NEXT: v_mov_b32_e32 v9, s27
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
; HEURRC-NEXT: v_accvgpr_write_b32 a31, s23
; HEURRC-NEXT: v_accvgpr_write_b32 a30, s22
@@ -4404,41 +4400,41 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4
; HEURRC-NEXT: v_accvgpr_write_b32 a17, s9
; HEURRC-NEXT: v_accvgpr_write_b32 a16, s8
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[16:31]
-; HEURRC-NEXT: v_mov_b32_e32 v0, s20
-; HEURRC-NEXT: v_mov_b32_e32 v1, s21
-; HEURRC-NEXT: v_mov_b32_e32 v2, s22
-; HEURRC-NEXT: v_mov_b32_e32 v3, s23
-; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[2:5], v[6:9], a[16:31]
+; HEURRC-NEXT: v_mov_b32_e32 v2, s20
+; HEURRC-NEXT: v_mov_b32_e32 v3, s21
+; HEURRC-NEXT: v_mov_b32_e32 v4, s22
+; HEURRC-NEXT: v_mov_b32_e32 v5, s23
+; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:48 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mov_b32_e32 v0, s16
-; HEURRC-NEXT: v_mov_b32_e32 v1, s17
-; HEURRC-NEXT: v_mov_b32_e32 v2, s18
-; HEURRC-NEXT: v_mov_b32_e32 v3, s19
-; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT: v_mov_b32_e32 v2, s16
+; HEURRC-NEXT: v_mov_b32_e32 v3, s17
+; HEURRC-NEXT: v_mov_b32_e32 v4, s18
+; HEURRC-NEXT: v_mov_b32_e32 v5, s19
+; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:32 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mov_b32_e32 v0, s12
-; HEURRC-NEXT: v_mov_b32_e32 v1, s13
-; HEURRC-NEXT: v_mov_b32_e32 v2, s14
-; HEURRC-NEXT: v_mov_b32_e32 v3, s15
-; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT: v_mov_b32_e32 v2, s12
+; HEURRC-NEXT: v_mov_b32_e32 v3, s13
+; HEURRC-NEXT: v_mov_b32_e32 v4, s14
+; HEURRC-NEXT: v_mov_b32_e32 v5, s15
+; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:16 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mov_b32_e32 v0, s8
-; HEURRC-NEXT: v_mov_b32_e32 v1, s9
-; HEURRC-NEXT: v_mov_b32_e32 v2, s10
-; HEURRC-NEXT: v_mov_b32_e32 v3, s11
-; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
+; HEURRC-NEXT: v_mov_b32_e32 v2, s8
+; HEURRC-NEXT: v_mov_b32_e32 v3, s9
+; HEURRC-NEXT: v_mov_b32_e32 v4, s10
+; HEURRC-NEXT: v_mov_b32_e32 v5, s11
+; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_endpgm
;
@@ -4446,17 +4442,17 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4
; VGPRRC: ; %bb.0:
; VGPRRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
; VGPRRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; VGPRRC-NEXT: v_mov_b32_e32 v40, 0
+; VGPRRC-NEXT: v_mov_b32_e32 v32, 0
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
-; VGPRRC-NEXT: v_mov_b32_e32 v32, s20
-; VGPRRC-NEXT: v_mov_b32_e32 v33, s21
-; VGPRRC-NEXT: v_mov_b32_e32 v34, s22
-; VGPRRC-NEXT: v_mov_b32_e32 v35, s23
+; VGPRRC-NEXT: v_mov_b32_e32 v34, s20
+; VGPRRC-NEXT: v_mov_b32_e32 v35, s21
+; VGPRRC-NEXT: v_mov_b32_e32 v36, s22
+; VGPRRC-NEXT: v_mov_b32_e32 v37, s23
; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; VGPRRC-NEXT: v_mov_b32_e32 v36, s24
-; VGPRRC-NEXT: v_mov_b32_e32 v37, s25
-; VGPRRC-NEXT: v_mov_b32_e32 v38, s26
-; VGPRRC-NEXT: v_mov_b32_e32 v39, s27
+; VGPRRC-NEXT: v_mov_b32_e32 v38, s24
+; VGPRRC-NEXT: v_mov_b32_e32 v39, s25
+; VGPRRC-NEXT: v_mov_b32_e32 v40, s26
+; VGPRRC-NEXT: v_mov_b32_e32 v41, s27
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
; VGPRRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
; VGPRRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
@@ -4467,42 +4463,42 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4
; VGPRRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
; VGPRRC-NEXT: s_nop 1
-; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31]
+; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[34:37], v[38:41], v[16:31]
; VGPRRC-NEXT: s_nop 6
; VGPRRC-NEXT: v_mov_b32_e32 v16, s20
; VGPRRC-NEXT: v_mov_b32_e32 v17, s21
; VGPRRC-NEXT: v_mov_b32_e32 v18, s22
; VGPRRC-NEXT: v_mov_b32_e32 v19, s23
-; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:48 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v16, s16
; VGPRRC-NEXT: v_mov_b32_e32 v17, s17
; VGPRRC-NEXT: v_mov_b32_e32 v18, s18
; VGPRRC-NEXT: v_mov_b32_e32 v19, s19
-; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v16, s12
; VGPRRC-NEXT: v_mov_b32_e32 v17, s13
; VGPRRC-NEXT: v_mov_b32_e32 v18, s14
; VGPRRC-NEXT: v_mov_b32_e32 v19, s15
-; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:16 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v16, s8
; VGPRRC-NEXT: v_mov_b32_e32 v17, s9
; VGPRRC-NEXT: v_mov_b32_e32 v18, s10
; VGPRRC-NEXT: v_mov_b32_e32 v19, s11
-; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_i32_32x32x32_i8__vgprcd:
@@ -4774,17 +4770,17 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; HEURRC-NEXT: v_mov_b32_e32 v8, 0
+; HEURRC-NEXT: v_mov_b32_e32 v0, 0
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b32_e32 v0, s20
-; HEURRC-NEXT: v_mov_b32_e32 v1, s21
-; HEURRC-NEXT: v_mov_b32_e32 v2, s22
-; HEURRC-NEXT: v_mov_b32_e32 v3, s23
+; HEURRC-NEXT: v_mov_b32_e32 v2, s20
+; HEURRC-NEXT: v_mov_b32_e32 v3, s21
+; HEURRC-NEXT: v_mov_b32_e32 v4, s22
+; HEURRC-NEXT: v_mov_b32_e32 v5, s23
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; HEURRC-NEXT: v_mov_b32_e32 v4, s24
-; HEURRC-NEXT: v_mov_b32_e32 v5, s25
-; HEURRC-NEXT: v_mov_b32_e32 v6, s26
-; HEURRC-NEXT: v_mov_b32_e32 v7, s27
+; HEURRC-NEXT: v_mov_b32_e32 v6, s24
+; HEURRC-NEXT: v_mov_b32_e32 v7, s25
+; HEURRC-NEXT: v_mov_b32_e32 v8, s26
+; HEURRC-NEXT: v_mov_b32_e32 v9, s27
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
; HEURRC-NEXT: v_accvgpr_write_b32 a31, s23
; HEURRC-NEXT: v_accvgpr_write_b32 a30, s22
@@ -4803,41 +4799,41 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a
; HEURRC-NEXT: v_accvgpr_write_b32 a17, s9
; HEURRC-NEXT: v_accvgpr_write_b32 a16, s8
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[16:31] cbsz:1 abid:2 blgp:3
-; HEURRC-NEXT: v_mov_b32_e32 v0, s20
-; HEURRC-NEXT: v_mov_b32_e32 v1, s21
-; HEURRC-NEXT: v_mov_b32_e32 v2, s22
-; HEURRC-NEXT: v_mov_b32_e32 v3, s23
-; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[2:5], v[6:9], a[16:31] cbsz:1 abid:2 blgp:3
+; HEURRC-NEXT: v_mov_b32_e32 v2, s20
+; HEURRC-NEXT: v_mov_b32_e32 v3, s21
+; HEURRC-NEXT: v_mov_b32_e32 v4, s22
+; HEURRC-NEXT: v_mov_b32_e32 v5, s23
+; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:48 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mov_b32_e32 v0, s16
-; HEURRC-NEXT: v_mov_b32_e32 v1, s17
-; HEURRC-NEXT: v_mov_b32_e32 v2, s18
-; HEURRC-NEXT: v_mov_b32_e32 v3, s19
-; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT: v_mov_b32_e32 v2, s16
+; HEURRC-NEXT: v_mov_b32_e32 v3, s17
+; HEURRC-NEXT: v_mov_b32_e32 v4, s18
+; HEURRC-NEXT: v_mov_b32_e32 v5, s19
+; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:32 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mov_b32_e32 v0, s12
-; HEURRC-NEXT: v_mov_b32_e32 v1, s13
-; HEURRC-NEXT: v_mov_b32_e32 v2, s14
-; HEURRC-NEXT: v_mov_b32_e32 v3, s15
-; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT: v_mov_b32_e32 v2, s12
+; HEURRC-NEXT: v_mov_b32_e32 v3, s13
+; HEURRC-NEXT: v_mov_b32_e32 v4, s14
+; HEURRC-NEXT: v_mov_b32_e32 v5, s15
+; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:16 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mov_b32_e32 v0, s8
-; HEURRC-NEXT: v_mov_b32_e32 v1, s9
-; HEURRC-NEXT: v_mov_b32_e32 v2, s10
-; HEURRC-NEXT: v_mov_b32_e32 v3, s11
-; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
+; HEURRC-NEXT: v_mov_b32_e32 v2, s8
+; HEURRC-NEXT: v_mov_b32_e32 v3, s9
+; HEURRC-NEXT: v_mov_b32_e32 v4, s10
+; HEURRC-NEXT: v_mov_b32_e32 v5, s11
+; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_endpgm
;
@@ -4845,17 +4841,17 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a
; VGPRRC: ; %bb.0:
; VGPRRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
; VGPRRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; VGPRRC-NEXT: v_mov_b32_e32 v40, 0
+; VGPRRC-NEXT: v_mov_b32_e32 v32, 0
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
-; VGPRRC-NEXT: v_mov_b32_e32 v32, s20
-; VGPRRC-NEXT: v_mov_b32_e32 v33, s21
-; VGPRRC-NEXT: v_mov_b32_e32 v34, s22
-; VGPRRC-NEXT: v_mov_b32_e32 v35, s23
+; VGPRRC-NEXT: v_mov_b32_e32 v34, s20
+; VGPRRC-NEXT: v_mov_b32_e32 v35, s21
+; VGPRRC-NEXT: v_mov_b32_e32 v36, s22
+; VGPRRC-NEXT: v_mov_b32_e32 v37, s23
; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; VGPRRC-NEXT: v_mov_b32_e32 v36, s24
-; VGPRRC-NEXT: v_mov_b32_e32 v37, s25
-; VGPRRC-NEXT: v_mov_b32_e32 v38, s26
-; VGPRRC-NEXT: v_mov_b32_e32 v39, s27
+; VGPRRC-NEXT: v_mov_b32_e32 v38, s24
+; VGPRRC-NEXT: v_mov_b32_e32 v39, s25
+; VGPRRC-NEXT: v_mov_b32_e32 v40, s26
+; VGPRRC-NEXT: v_mov_b32_e32 v41, s27
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
; VGPRRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
; VGPRRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
@@ -4866,42 +4862,42 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a
; VGPRRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
; VGPRRC-NEXT: s_nop 1
-; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
+; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[34:37], v[38:41], v[16:31] cbsz:1 abid:2 blgp:3
; VGPRRC-NEXT: s_nop 6
; VGPRRC-NEXT: v_mov_b32_e32 v16, s20
; VGPRRC-NEXT: v_mov_b32_e32 v17, s21
; VGPRRC-NEXT: v_mov_b32_e32 v18, s22
; VGPRRC-NEXT: v_mov_b32_e32 v19, s23
-; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:48 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v16, s16
; VGPRRC-NEXT: v_mov_b32_e32 v17, s17
; VGPRRC-NEXT: v_mov_b32_e32 v18, s18
; VGPRRC-NEXT: v_mov_b32_e32 v19, s19
-; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v16, s12
; VGPRRC-NEXT: v_mov_b32_e32 v17, s13
; VGPRRC-NEXT: v_mov_b32_e32 v18, s14
; VGPRRC-NEXT: v_mov_b32_e32 v19, s15
-; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:16 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v16, s8
; VGPRRC-NEXT: v_mov_b32_e32 v17, s9
; VGPRRC-NEXT: v_mov_b32_e32 v18, s10
; VGPRRC-NEXT: v_mov_b32_e32 v19, s11
-; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_i32_32x32x32_i8__vgprcd__flags:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll
index 6b7e8cba00a36..fb1e46d955752 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll
@@ -8,26 +8,6 @@ declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x8.xf32(<2 x float>, <2 x float>,
declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x4.xf32(<2 x float>, <2 x float>, <16 x float>, i32, i32, i32)
define amdgpu_kernel void @test_mfma_f32_16x16x8xf32(ptr addrspace(1) %arg) #0 {
-; GFX942-LABEL: test_mfma_f32_16x16x8xf32:
-; GFX942: ; %bb.0: ; %bb
-; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-NEXT: v_mov_b32_e32 v0, 1.0
-; GFX942-NEXT: v_mov_b32_e32 v1, 2.0
-; GFX942-NEXT: v_mov_b32_e32 v2, 0x40400000
-; GFX942-NEXT: v_mov_b32_e32 v3, 4.0
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX942-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX942-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX942-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_16x16x8_xf32 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX942-NEXT: s_nop 6
-; GFX942-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
-; GFX942-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8.xf32(<2 x float> <float 1.0, float 2.0>, <2 x float> <float 3.0, float 4.0>, <4 x float> %in.1, i32 1, i32 2, i32 3)
@@ -36,42 +16,6 @@ bb:
}
define amdgpu_kernel void @test_mfma_f32_32x32x4xf32(ptr addrspace(1) %arg) #0 {
-; GFX942-LABEL: test_mfma_f32_32x32x4xf32:
-; GFX942: ; %bb.0: ; %bb
-; GFX942-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
-; GFX942-NEXT: v_mov_b32_e32 v0, 1.0
-; GFX942-NEXT: v_mov_b32_e32 v1, 2.0
-; GFX942-NEXT: v_mov_b32_e32 v2, 0x40400000
-; GFX942-NEXT: v_mov_b32_e32 v3, 4.0
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX942-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX942-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX942-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX942-NEXT: v_accvgpr_write_b32 a4, s4
-; GFX942-NEXT: v_accvgpr_write_b32 a5, s5
-; GFX942-NEXT: v_accvgpr_write_b32 a6, s6
-; GFX942-NEXT: v_accvgpr_write_b32 a7, s7
-; GFX942-NEXT: v_accvgpr_write_b32 a8, s8
-; GFX942-NEXT: v_accvgpr_write_b32 a9, s9
-; GFX942-NEXT: v_accvgpr_write_b32 a10, s10
-; GFX942-NEXT: v_accvgpr_write_b32 a11, s11
-; GFX942-NEXT: v_accvgpr_write_b32 a12, s12
-; GFX942-NEXT: v_accvgpr_write_b32 a13, s13
-; GFX942-NEXT: v_accvgpr_write_b32 a14, s14
-; GFX942-NEXT: v_accvgpr_write_b32 a15, s15
-; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_32x32x4_xf32 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-NEXT: s_nop 7
-; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
-; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
-; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
-; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
-; GFX942-NEXT: s_endpgm
bb:
%in.1 = load <16 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4.xf32(<2 x float> <float 1.0, float 2.0>, <2 x float> <float 3.0, float 4.0>, <16 x float> %in.1, i32 1, i32 2, i32 3)
@@ -82,4 +26,5 @@ bb:
attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GCN: {{.*}}
+; GFX942: {{.*}}
; GISEL: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll
index 44c88f847f92e..eb0d5465cacd9 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll
@@ -127,34 +127,26 @@ define amdgpu_kernel void @max_10_vgprs_used_9a() #1 {
; GFX908: ; %bb.0:
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ;;#ASMEND
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a4
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ;;#ASMEND
-; GFX908-NEXT: v_accvgpr_read_b32 v7, a3
-; GFX908-NEXT: v_accvgpr_read_b32 v6, a2
-; GFX908-NEXT: v_accvgpr_read_b32 v5, a1
-; GFX908-NEXT: v_accvgpr_read_b32 v4, a0
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ;;#ASMEND
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ;;#ASMEND
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a1
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a2
-; GFX908-NEXT: v_accvgpr_write_b32 a1, v3
-; GFX908-NEXT: v_accvgpr_read_b32 v8, a5
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
-; GFX908-NEXT: v_accvgpr_write_b32 a8, v7
-; GFX908-NEXT: v_accvgpr_write_b32 a2, v8
-; GFX908-NEXT: v_accvgpr_write_b32 a3, v2
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a1
; GFX908-NEXT: v_accvgpr_write_b32 a4, v3
-; GFX908-NEXT: v_accvgpr_write_b32 a7, v6
-; GFX908-NEXT: v_accvgpr_write_b32 a6, v5
-; GFX908-NEXT: v_accvgpr_write_b32 a5, v4
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a2, v1
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ;;#ASMEND
-; GFX908-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX908-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a0, v4
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v5
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ;;#ASMEND
; GFX908-NEXT: s_endpgm
@@ -173,16 +165,16 @@ define amdgpu_kernel void @max_10_vgprs_used_9a() #1 {
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v9, a3
-; GFX90A-NEXT: v_accvgpr_read_b32 v8, a2
+; GFX90A-NEXT: v_accvgpr_read_b32 v5, a3
+; GFX90A-NEXT: v_accvgpr_read_b32 v4, a2
; GFX90A-NEXT: v_accvgpr_write_b32 a5, v3
; GFX90A-NEXT: v_accvgpr_write_b32 a4, v2
; GFX90A-NEXT: v_accvgpr_write_b32 a3, v1
; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v8
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v9
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v4
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v5
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_endpgm
More information about the llvm-commits
mailing list