[llvm] 6118a25 - [AMDGPU] Allocate AVRegClass last (#146606)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Jul 25 13:46:45 PDT 2025
Author: Jeffrey Byrnes
Date: 2025-07-25T13:46:41-07:00
New Revision: 6118a254ff9acb4b54a6bdb952c088a792b679af
URL: https://github.com/llvm/llvm-project/commit/6118a254ff9acb4b54a6bdb952c088a792b679af
DIFF: https://github.com/llvm/llvm-project/commit/6118a254ff9acb4b54a6bdb952c088a792b679af.diff
LOG: [AMDGPU] Allocate AVRegClass last (#146606)
This changes the RC priorities such that AVRegClass is the least
prioritized. These registers are less constrained than the VRegClass and
ARegClass as they can be either agpr or vgpr. Thus, assigning them last
removes unnecessary constraints from VRegClass and ARegClass
assignments, and allows the RA to make smarter decisions about whether
to use vgpr / agpr for AVRegClass.
We only have 5 bits for RC priorities, and we still want to prioritize
larger RCs over smaller ones. Since this new prioritization uses the 5th
bit for AVRegClass vs ARegClass / VRegClass, we only have 4 bits to
encode the size priorities. Previously, each RC with a distinct size,
had a distinct priority. However, this PR groups together multiple sizes
to the same priority. Currently, this will have no effect on
prioritization in practice because we only have one actually defined RC
per group per vector register type.
For example, a register class with 15 or 16 32bit registers will have
the same size priority (14). However, we only have VReg_512 (VReg_480
doesn't exist), so only one actual RC in VRegClass has this priority.
Similarly, we give register class with 17-32+ 32 bit registers a size
priority of 15, but we only have VReg_1024.
The effect of this PR is to prioritize first the vector register type
(VReg & Areg have top priority, then AVReg), with the size of the
register class having second priority.
Passes PSDB.
---------
Co-authored-by: Matt Arsenault <Matthew.Arsenault at amd.com>
Added:
llvm/test/CodeGen/AMDGPU/large-avgpr-assign-last.mir
Modified:
llvm/lib/Target/AMDGPU/SIRegisterInfo.td
llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
llvm/test/CodeGen/AMDGPU/bad-agpr-vgpr-regalloc-priority.mir
llvm/test/CodeGen/AMDGPU/bf16.ll
llvm/test/CodeGen/AMDGPU/build_vector.ll
llvm/test/CodeGen/AMDGPU/fix-crash-valu-hazard.ll
llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll
llvm/test/CodeGen/AMDGPU/inflated-reg-class-snippet-copy-use-after-free.mir
llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.AFLCustomIRMutator.opt.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.gfx90a.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
llvm/test/CodeGen/AMDGPU/packed-fp32.ll
llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
llvm/test/CodeGen/AMDGPU/regalloc-illegal-eviction-assert.ll
llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll
llvm/test/CodeGen/AMDGPU/shufflevector-physreg-copy.ll
llvm/test/CodeGen/AMDGPU/spill-agpr.ll
llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll
llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index 0039d2ffa100e..218841d182ff2 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -109,6 +109,23 @@ class SIRegisterClass <string n, list<ValueType> rTypes, int Align, dag rList>
let TSFlags{2} = HasVGPR;
let TSFlags{3} = HasAGPR;
let TSFlags{4} = HasSGPR;
+
+ // RA will use RegisterClass AllocationPriority amongst other info (e.g. ordering in the basic block)
+ // to decide which registers to try to assign first. Usually, this RegisterClass priority is given
+ // very high priority, if not the highest priority, when considering which VirtReg to allocate next.
+ //
+ // We have 5 bits to assign AllocationPriorities to RegisterClasses. Generally, it is beneficial to
+ // assign more constrained RegisterClasses first. As a result, we prioritize register classes with
+ // more 32 bit tuples (e.g. VReg_512) over registers with fewer tuples (e.g. VGPR_32).
+ //
+ // The interesting case is the vector register case on architectures which have ARegs, VRegs, AVRegs.
+ // In this case, we would like to assign ARegs and VRegs before AVRegs, as AVRegs are less constrained
+ // and can be assigned to both AGPRs and VGPRs. We use the 5th bit to encode this into the
+ // RegisterClass AllocationPriority. BaseClassPriority is used to turn the bit on, and BaseClassScaleFactor
+ // is used for scaling of the bit (i.e. 1 << 4).
+ field int BaseClassPriority = 1;
+ field int BaseClassScaleFactor = 16;
+
}
multiclass SIRegLoHi16 <string n, bits<8> regIdx, bit ArtificialHigh = 1,
@@ -575,7 +592,7 @@ let HasVGPR = 1 in {
def VGPR_16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16,
(add (interleave (sequence "VGPR%u_LO16", 0, 255),
(sequence "VGPR%u_HI16", 0, 255)))> {
- let AllocationPriority = 2;
+ let AllocationPriority = !add(2, !mul(BaseClassPriority, BaseClassScaleFactor));
let Size = 16;
let GeneratePressureSet = 0;
@@ -601,7 +618,7 @@ def VGPR_16_Lo128 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16,
// i16/f16 only on VI+
def VGPR_32 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types.types), 32,
(add (sequence "VGPR%u", 0, 255))> {
- let AllocationPriority = 0;
+ let AllocationPriority = !add(0, !mul(BaseClassPriority, BaseClassScaleFactor));
let Size = 32;
let Weight = 1;
let BaseClassOrder = 32;
@@ -610,7 +627,7 @@ def VGPR_32 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types
// Identical to VGPR_32 except it only contains the low 128 (Lo128) registers.
def VGPR_32_Lo128 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types.types), 32,
(add (sequence "VGPR%u", 0, 127))> {
- let AllocationPriority = 0;
+ let AllocationPriority = !add(0, !mul(BaseClassPriority, BaseClassScaleFactor));
let GeneratePressureSet = 0;
let Size = 32;
let Weight = 1;
@@ -668,7 +685,7 @@ def AGPR_LO16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16,
// AccVGPR 32-bit registers
def AGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32,
(add (sequence "AGPR%u", 0, 255))> {
- let AllocationPriority = 0;
+ let AllocationPriority = !add(0, !mul(BaseClassPriority, BaseClassScaleFactor));
let Size = 32;
let Weight = 1;
let BaseClassOrder = 32;
@@ -940,14 +957,23 @@ class VRegClassBase<int numRegs, list<ValueType> regTypes, dag regList> :
// Requires n v_mov_b32 to copy
let CopyCost = numRegs;
- let AllocationPriority = !sub(numRegs, 1);
+
+ // Since we only have 5 bits for the RegisterClass Allocation Priorty, and since we use the
+ // 5th bit for BaseClassPriority, we need to encode the SizePriority into 4 bits. As a result
+ // of this encoding, for registers with numRegs 15 or 16, we give SizePriority of 14, and for
+ // regsters with numRegs 17+ we give SizePriority of 15. In practice, there is only one
+ // RegClass per Vector Register type in each of these groups (i.e. numRegs = 15,16 : {VReg_512},
+ // and numRegs = 17+ : {VReg_1024}). Therefore, we have not lost any info by compressing.
+ defvar SizePrioriity = !if(!le(numRegs, 14), !sub(numRegs, 1), !if(!le(numRegs, 16), 14, 15));
+
+ let AllocationPriority = !add(SizePrioriity, !mul(BaseClassPriority, BaseClassScaleFactor));
let Weight = numRegs;
}
// Define a register tuple class, along with one requiring an even
// aligned base register.
multiclass VRegClass<int numRegs, list<ValueType> regTypes, dag regList> {
- let HasVGPR = 1 in {
+ let HasVGPR = 1, BaseClassPriority = 1 in {
// Define the regular class.
def "" : VRegClassBase<numRegs, regTypes, regList> {
let BaseClassOrder = !mul(numRegs, 32);
@@ -981,7 +1007,7 @@ defm VReg_1024 : VRegClass<32, Reg1024Types.types, (add VGPR_1024)>;
}
multiclass ARegClass<int numRegs, list<ValueType> regTypes, dag regList> {
- let CopyCost = !add(numRegs, numRegs, 1), HasAGPR = 1 in {
+ let CopyCost = !add(numRegs, numRegs, 1), HasAGPR = 1, BaseClassPriority = 1 in {
// Define the regular class.
def "" : VRegClassBase<numRegs, regTypes, regList> {
let BaseClassOrder = !mul(numRegs, 32);
@@ -1066,6 +1092,7 @@ def VS_64 : SIRegisterClass<"AMDGPU", VReg_64.RegTypes, 32, (add VReg_64, SReg_6
def AV_32 : SIRegisterClass<"AMDGPU", VGPR_32.RegTypes, 32, (add VGPR_32, AGPR_32)> {
let HasVGPR = 1;
let HasAGPR = 1;
+ let BaseClassPriority = 0;
let Size = 32;
}
} // End GeneratePressureSet = 0
@@ -1074,7 +1101,7 @@ def AV_32 : SIRegisterClass<"AMDGPU", VGPR_32.RegTypes, 32, (add VGPR_32, AGPR_3
// aligned base register.
multiclass AVRegClass<int numRegs, list<ValueType> regTypes,
dag vregList, dag aregList> {
- let HasVGPR = 1, HasAGPR = 1 in {
+ let HasVGPR = 1, HasAGPR = 1, BaseClassPriority = 0 in {
// Define the regular class.
def "" : VRegClassBase<numRegs, regTypes, (add vregList, aregList)>;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
index 8192d4a6457d7..0e132f130c844 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
@@ -70,12 +70,12 @@ define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsr
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc:
@@ -85,12 +85,12 @@ define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsr
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
@@ -153,12 +153,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrsp
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_ptr_buffer_atomic_add_rtn_f64_off4_slc:
@@ -168,12 +168,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrsp
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2)
@@ -236,12 +236,12 @@ define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc:
@@ -251,12 +251,12 @@ define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
@@ -319,12 +319,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr add
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_ptr_buffer_atomic_add_rtn_f64_off4_slc:
@@ -334,12 +334,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr add
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
@@ -402,12 +402,12 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsr
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc:
@@ -417,12 +417,12 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsr
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
@@ -485,12 +485,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrsp
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_ptr_buffer_atomic_min_rtn_f64_off4_slc:
@@ -500,12 +500,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrsp
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2)
@@ -568,12 +568,12 @@ define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_buffer_atomic_min_rtn_f64_off4_slc:
@@ -583,12 +583,12 @@ define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
@@ -651,12 +651,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr add
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_ptr_buffer_atomic_min_rtn_f64_off4_slc:
@@ -666,12 +666,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr add
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
@@ -734,12 +734,12 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsr
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
@@ -749,12 +749,12 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsr
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
@@ -817,12 +817,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc:
@@ -832,12 +832,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2)
@@ -900,12 +900,12 @@ define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc:
@@ -915,12 +915,12 @@ define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
@@ -983,12 +983,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr add
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_ptr_buffer_atomic_max_rtn_f64_off4_slc:
@@ -998,12 +998,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr add
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
diff --git a/llvm/test/CodeGen/AMDGPU/bad-agpr-vgpr-regalloc-priority.mir b/llvm/test/CodeGen/AMDGPU/bad-agpr-vgpr-regalloc-priority.mir
index 1a457c94778fd..9241a23fca433 100644
--- a/llvm/test/CodeGen/AMDGPU/bad-agpr-vgpr-regalloc-priority.mir
+++ b/llvm/test/CodeGen/AMDGPU/bad-agpr-vgpr-regalloc-priority.mir
@@ -38,20 +38,20 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: early-clobber renamable $sgpr6_sgpr7 = S_LOAD_DWORDX2_IMM_ec renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4)
; CHECK-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM renamable $sgpr6_sgpr7, 0, 0 :: ("amdgpu-noclobber" load (s128), addrspace 1)
- ; CHECK-NEXT: renamable $vgpr4 = V_MOV_B32_e32 1065353216, implicit $exec
- ; CHECK-NEXT: renamable $vgpr5 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: renamable $vgpr6 = V_MOV_B32_e32 1073741824, implicit $exec
+ ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 1065353216, implicit $exec
+ ; CHECK-NEXT: renamable $vgpr4 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: renamable $vgpr1 = V_MOV_B32_e32 1073741824, implicit $exec
; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3
- ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr4, $vgpr6, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: renamable $vgpr1 = COPY renamable $agpr1
- ; CHECK-NEXT: renamable $vgpr0 = COPY renamable $agpr0
- ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr4, $vgpr6, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: renamable $vgpr3 = COPY renamable $agpr1
- ; CHECK-NEXT: renamable $vgpr2 = COPY killed renamable $agpr0
- ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3
- ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr4, killed $vgpr6, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr0, $vgpr1, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: renamable $vgpr6 = COPY renamable $agpr1
+ ; CHECK-NEXT: renamable $vgpr5 = COPY renamable $agpr0
+ ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr0, $vgpr1, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: renamable $vgpr8 = COPY renamable $agpr1
+ ; CHECK-NEXT: renamable $vgpr7 = COPY killed renamable $agpr0
+ ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr5_vgpr6_vgpr7_vgpr8
+ ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr0, killed $vgpr1, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed renamable $agpr0_agpr1_agpr2_agpr3
- ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr5, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $sgpr6_sgpr7, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr4, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $sgpr6_sgpr7, 0, 0, implicit $exec :: (store (s128), addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
early-clobber renamable $sgpr6_sgpr7 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4)
renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM renamable $sgpr6_sgpr7, 0, 0 :: ("amdgpu-noclobber" load (s128), addrspace 1)
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 7859fcdfe8a70..52e697cae9fe5 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -468,15 +468,28 @@ define <16 x bfloat> @v_load_global_v16bf16(ptr addrspace(1) %ptr) {
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_load_global_v16bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v9, v1
-; GFX9-NEXT: v_mov_b32_e32 v8, v0
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[8:9], off
-; GFX9-NEXT: global_load_dwordx4 v[4:7], v[8:9], off offset:16
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_load_global_v16bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v9, v1
+; GFX900-NEXT: v_mov_b32_e32 v8, v0
+; GFX900-NEXT: global_load_dwordx4 v[0:3], v[8:9], off
+; GFX900-NEXT: global_load_dwordx4 v[4:7], v[8:9], off offset:16
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_load_global_v16bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: global_load_dwordx4 v[8:11], v[0:1], off
+; GFX950-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16
+; GFX950-NEXT: s_waitcnt vmcnt(1)
+; GFX950-NEXT: v_mov_b32_e32 v0, v8
+; GFX950-NEXT: v_mov_b32_e32 v1, v9
+; GFX950-NEXT: v_mov_b32_e32 v2, v10
+; GFX950-NEXT: v_mov_b32_e32 v3, v11
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_load_global_v16bf16:
; GFX10: ; %bb.0:
@@ -619,17 +632,32 @@ define <32 x bfloat> @v_load_global_v32bf16(ptr addrspace(1) %ptr) {
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_load_global_v32bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v17, v1
-; GFX9-NEXT: v_mov_b32_e32 v16, v0
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[16:17], off
-; GFX9-NEXT: global_load_dwordx4 v[4:7], v[16:17], off offset:16
-; GFX9-NEXT: global_load_dwordx4 v[8:11], v[16:17], off offset:32
-; GFX9-NEXT: global_load_dwordx4 v[12:15], v[16:17], off offset:48
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_load_global_v32bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v17, v1
+; GFX900-NEXT: v_mov_b32_e32 v16, v0
+; GFX900-NEXT: global_load_dwordx4 v[0:3], v[16:17], off
+; GFX900-NEXT: global_load_dwordx4 v[4:7], v[16:17], off offset:16
+; GFX900-NEXT: global_load_dwordx4 v[8:11], v[16:17], off offset:32
+; GFX900-NEXT: global_load_dwordx4 v[12:15], v[16:17], off offset:48
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_load_global_v32bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: global_load_dwordx4 v[16:19], v[0:1], off
+; GFX950-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16
+; GFX950-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:32
+; GFX950-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:48
+; GFX950-NEXT: s_waitcnt vmcnt(3)
+; GFX950-NEXT: v_mov_b32_e32 v0, v16
+; GFX950-NEXT: v_mov_b32_e32 v1, v17
+; GFX950-NEXT: v_mov_b32_e32 v2, v18
+; GFX950-NEXT: v_mov_b32_e32 v3, v19
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_load_global_v32bf16:
; GFX10: ; %bb.0:
@@ -877,22 +905,41 @@ define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) {
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_load_global_v64bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v29, v1
-; GFX9-NEXT: v_mov_b32_e32 v28, v0
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[28:29], off
-; GFX9-NEXT: global_load_dwordx4 v[4:7], v[28:29], off offset:16
-; GFX9-NEXT: global_load_dwordx4 v[8:11], v[28:29], off offset:32
-; GFX9-NEXT: global_load_dwordx4 v[12:15], v[28:29], off offset:48
-; GFX9-NEXT: global_load_dwordx4 v[16:19], v[28:29], off offset:64
-; GFX9-NEXT: global_load_dwordx4 v[20:23], v[28:29], off offset:80
-; GFX9-NEXT: global_load_dwordx4 v[24:27], v[28:29], off offset:96
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: global_load_dwordx4 v[28:31], v[28:29], off offset:112
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_load_global_v64bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v29, v1
+; GFX900-NEXT: v_mov_b32_e32 v28, v0
+; GFX900-NEXT: global_load_dwordx4 v[0:3], v[28:29], off
+; GFX900-NEXT: global_load_dwordx4 v[4:7], v[28:29], off offset:16
+; GFX900-NEXT: global_load_dwordx4 v[8:11], v[28:29], off offset:32
+; GFX900-NEXT: global_load_dwordx4 v[12:15], v[28:29], off offset:48
+; GFX900-NEXT: global_load_dwordx4 v[16:19], v[28:29], off offset:64
+; GFX900-NEXT: global_load_dwordx4 v[20:23], v[28:29], off offset:80
+; GFX900-NEXT: global_load_dwordx4 v[24:27], v[28:29], off offset:96
+; GFX900-NEXT: s_nop 0
+; GFX900-NEXT: global_load_dwordx4 v[28:31], v[28:29], off offset:112
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_load_global_v64bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: global_load_dwordx4 v[32:35], v[0:1], off
+; GFX950-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16
+; GFX950-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:32
+; GFX950-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:48
+; GFX950-NEXT: global_load_dwordx4 v[16:19], v[0:1], off offset:64
+; GFX950-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:80
+; GFX950-NEXT: global_load_dwordx4 v[24:27], v[0:1], off offset:96
+; GFX950-NEXT: global_load_dwordx4 v[28:31], v[0:1], off offset:112
+; GFX950-NEXT: s_waitcnt vmcnt(7)
+; GFX950-NEXT: v_mov_b32_e32 v0, v32
+; GFX950-NEXT: v_mov_b32_e32 v1, v33
+; GFX950-NEXT: v_mov_b32_e32 v2, v34
+; GFX950-NEXT: v_mov_b32_e32 v3, v35
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_load_global_v64bf16:
; GFX10: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/build_vector.ll b/llvm/test/CodeGen/AMDGPU/build_vector.ll
index 7208eaeff8eb1..763f436997c21 100644
--- a/llvm/test/CodeGen/AMDGPU/build_vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/build_vector.ll
@@ -51,11 +51,11 @@ define amdgpu_kernel void @build_vector2 (ptr addrspace(1) %out) {
; GFX942-LABEL: build_vector2:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
-; GFX942-NEXT: v_mov_b32_e32 v0, 5
-; GFX942-NEXT: v_mov_b32_e32 v1, 6
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, 5
+; GFX942-NEXT: v_mov_b32_e32 v3, 6
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1]
; GFX942-NEXT: s_endpgm
entry:
store <2 x i32> <i32 5, i32 6>, ptr addrspace(1) %out
@@ -116,13 +116,13 @@ define amdgpu_kernel void @build_vector4 (ptr addrspace(1) %out) {
; GFX942-LABEL: build_vector4:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-NEXT: v_mov_b32_e32 v0, 5
-; GFX942-NEXT: v_mov_b32_e32 v1, 6
-; GFX942-NEXT: v_mov_b32_e32 v2, 7
-; GFX942-NEXT: v_mov_b32_e32 v3, 8
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, 5
+; GFX942-NEXT: v_mov_b32_e32 v3, 6
+; GFX942-NEXT: v_mov_b32_e32 v4, 7
+; GFX942-NEXT: v_mov_b32_e32 v5, 8
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
; GFX942-NEXT: s_endpgm
entry:
store <4 x i32> <i32 5, i32 6, i32 7, i32 8>, ptr addrspace(1) %out
@@ -307,13 +307,13 @@ define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out,
; GFX942-LABEL: build_v2i32_from_v4i16_shuffle:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_lshl_b32 s3, s3, 16
; GFX942-NEXT: s_lshl_b32 s2, s2, 16
-; GFX942-NEXT: v_mov_b32_e32 v0, s2
-; GFX942-NEXT: v_mov_b32_e32 v1, s3
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v2, s2
+; GFX942-NEXT: v_mov_b32_e32 v3, s3
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1]
; GFX942-NEXT: s_endpgm
entry:
%shuf = shufflevector <4 x i16> %in, <4 x i16> zeroinitializer, <2 x i32> <i32 0, i32 2>
diff --git a/llvm/test/CodeGen/AMDGPU/fix-crash-valu-hazard.ll b/llvm/test/CodeGen/AMDGPU/fix-crash-valu-hazard.ll
index 87811968c7871..4f752d102db74 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-crash-valu-hazard.ll
+++ b/llvm/test/CodeGen/AMDGPU/fix-crash-valu-hazard.ll
@@ -8,10 +8,10 @@ define amdgpu_ps void @global_load_lds_dword_saddr(ptr addrspace(1) inreg nocapt
; GFX942-LABEL: global_load_lds_dword_saddr:
; GFX942: ; %bb.0: ; %main_body
; GFX942-NEXT: v_readfirstlane_b32 s2, v0
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b32_e32 v1, 0
; GFX942-NEXT: s_mov_b32 m0, s2
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: global_load_lds_dword v2, s[0:1] offset:32 nt
+; GFX942-NEXT: global_load_lds_dword v1, s[0:1] offset:32 nt
; GFX942-NEXT: s_getpc_b64 s[0:1]
; GFX942-NEXT: s_add_u32 s0, s0, G at gotpcrel32@lo+4
; GFX942-NEXT: s_addc_u32 s1, s1, G at gotpcrel32@hi+12
@@ -21,9 +21,9 @@ define amdgpu_ps void @global_load_lds_dword_saddr(ptr addrspace(1) inreg nocapt
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_mul_i32 s3, s3, 10
; GFX942-NEXT: s_mul_i32 s2, s2, 10
-; GFX942-NEXT: v_mov_b32_e32 v0, s2
-; GFX942-NEXT: v_mov_b32_e32 v1, s3
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v2, s2
+; GFX942-NEXT: v_mov_b32_e32 v3, s3
+; GFX942-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1]
; GFX942-NEXT: s_endpgm
;
; GFX90A-LABEL: global_load_lds_dword_saddr:
diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
index 873fceedd7b72..6067194d947fa 100644
--- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
@@ -71,12 +71,12 @@ define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsr
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc:
@@ -86,12 +86,12 @@ define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsr
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
@@ -154,12 +154,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrsp
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_ptr_buffer_atomic_add_rtn_f64_off4_slc:
@@ -169,12 +169,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrsp
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2)
@@ -237,12 +237,12 @@ define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc:
@@ -252,12 +252,12 @@ define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
@@ -320,12 +320,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr add
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_ptr_buffer_atomic_add_rtn_f64_off4_slc:
@@ -335,12 +335,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr add
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
@@ -403,12 +403,12 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsr
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc:
@@ -418,12 +418,12 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsr
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
@@ -486,12 +486,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrsp
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_ptr_buffer_atomic_min_rtn_f64_off4_slc:
@@ -501,12 +501,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrsp
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2)
@@ -569,12 +569,12 @@ define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_buffer_atomic_min_rtn_f64_off4_slc:
@@ -584,12 +584,12 @@ define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
@@ -652,12 +652,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr add
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_ptr_buffer_atomic_min_rtn_f64_off4_slc:
@@ -667,12 +667,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr add
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
@@ -735,12 +735,12 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsr
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
@@ -750,12 +750,12 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsr
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
@@ -818,12 +818,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc:
@@ -833,12 +833,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2)
@@ -901,12 +901,12 @@ define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc:
@@ -916,12 +916,12 @@ define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
@@ -984,12 +984,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr add
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_ptr_buffer_atomic_max_rtn_f64_off4_slc:
@@ -999,12 +999,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr add
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
diff --git a/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll b/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll
index 258aa9e299c3d..0a493e5188ad5 100644
--- a/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll
+++ b/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll
@@ -8,15 +8,15 @@ define protected amdgpu_kernel void @InferNothing(i32 %a, ptr %b, double %c) {
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_ashr_i32 s7, s6, 31
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: v_mov_b32_e32 v2, s2
+; CHECK-NEXT: v_mov_b32_e32 v3, s3
; CHECK-NEXT: s_lshl_b64 s[2:3], s[6:7], 3
; CHECK-NEXT: s_add_u32 s0, s2, s0
; CHECK-NEXT: s_addc_u32 s1, s3, s1
-; CHECK-NEXT: v_mov_b32_e32 v3, s1
-; CHECK-NEXT: v_add_co_u32_e64 v2, vcc, -8, s0
-; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc
-; CHECK-NEXT: flat_atomic_add_f64 v[2:3], v[0:1]
+; CHECK-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-NEXT: v_add_co_u32_e64 v0, vcc, -8, s0
+; CHECK-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; CHECK-NEXT: flat_atomic_add_f64 v[0:1], v[2:3]
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: buffer_wbinvl1_vol
; CHECK-NEXT: s_endpgm
@@ -35,15 +35,15 @@ define protected amdgpu_kernel void @InferFadd(i32 %a, ptr addrspace(1) %b, doub
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_ashr_i32 s7, s6, 31
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: v_mov_b32_e32 v2, s2
+; CHECK-NEXT: v_mov_b32_e32 v3, s3
; CHECK-NEXT: s_lshl_b64 s[2:3], s[6:7], 3
; CHECK-NEXT: s_add_u32 s0, s0, s2
; CHECK-NEXT: s_addc_u32 s1, s1, s3
-; CHECK-NEXT: v_mov_b32_e32 v3, s1
-; CHECK-NEXT: v_add_co_u32_e64 v2, vcc, -8, s0
-; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc
-; CHECK-NEXT: flat_atomic_add_f64 v[2:3], v[0:1]
+; CHECK-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-NEXT: v_add_co_u32_e64 v0, vcc, -8, s0
+; CHECK-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; CHECK-NEXT: flat_atomic_add_f64 v[0:1], v[2:3]
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: buffer_wbinvl1_vol
; CHECK-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/inflated-reg-class-snippet-copy-use-after-free.mir b/llvm/test/CodeGen/AMDGPU/inflated-reg-class-snippet-copy-use-after-free.mir
index 11de6c8d52d59..06c3da09eede9 100644
--- a/llvm/test/CodeGen/AMDGPU/inflated-reg-class-snippet-copy-use-after-free.mir
+++ b/llvm/test/CodeGen/AMDGPU/inflated-reg-class-snippet-copy-use-after-free.mir
@@ -32,32 +32,14 @@
# CHECK-NEXT: undef [[SPLIT0:%[0-9]+]].sub2_sub3:av_512_align2 = COPY undef $vgpr2_vgpr3 {
# CHECK-NEXT: internal [[SPLIT0]].sub0:av_512_align2 = COPY undef $vgpr0
# CHECK-NEXT: }
-# CHECK-NEXT: undef [[SPLIT1:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[SPLIT0]].sub2_sub3 {
-# CHECK-NEXT: internal [[SPLIT1]].sub0:av_512_align2 = COPY [[SPLIT0]].sub0
-# CHECK-NEXT: }
-# CHECK-NEXT: undef [[SPLIT2:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[SPLIT1]].sub2_sub3 {
-# CHECK-NEXT: internal [[SPLIT2]].sub0:av_512_align2 = COPY [[SPLIT1]].sub0
-# CHECK-NEXT: }
-# CHECK-NEXT: SI_SPILL_AV512_SAVE [[SPLIT2]], %stack.1, $sgpr32, 0, implicit $exec :: (store (s512) into %stack.1, align 4, addrspace 5)
-# CHECK-NEXT: [[RESTORE1:%[0-9]+]]:av_512_align2 = SI_SPILL_AV512_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.0, align 4, addrspace 5)
-# CHECK-NEXT: undef [[SPLIT3:%[0-9]+]].sub0_sub1:av_512_align2 = COPY [[RESTORE1]].sub0_sub1
-# CHECK-NEXT: [[RESTORE2:%[0-9]+]]:av_512_align2 = SI_SPILL_AV512_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.1, align 4, addrspace 5)
-# CHECK-NEXT: undef [[SPLIT3:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[RESTORE2]].sub2_sub3 {
-# CHECK-NEXT: internal [[SPLIT3]].sub0:av_512_align2 = COPY [[RESTORE2]].sub0
-# CHECK-NEXT: }
-# CHECK-NEXT: undef [[SPLIT4:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[SPLIT3]].sub2_sub3 {
-# CHECK-NEXT: internal [[SPLIT4]].sub0:av_512_align2 = COPY [[SPLIT3]].sub0
-# CHECK-NEXT: }
-# CHECK-NEXT: [[SPLIT5:%[0-9]+]].sub2:av_512_align2 = COPY [[SPLIT4]].sub3
-# CHECK-NEXT: undef [[SPLIT6:%[0-9]+]].sub0_sub1_sub2:av_512_align2 = COPY [[SPLIT5]].sub0_sub1_sub2
-# CHECK-NEXT: undef [[SPLIT7:%[0-9]+]].sub0_sub1_sub2:av_512_align2 = COPY [[SPLIT6]].sub0_sub1_sub2
-# CHECK-NEXT: undef [[SPLIT8:%[0-9]+]].sub0:av_512_align2 = COPY [[SPLIT4]].sub0 {
-# CHECK-NEXT: internal [[SPLIT8]].sub2:av_512_align2 = COPY [[SPLIT4]].sub2
+# CHECK-NEXT: undef [[SPLIT2:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[SPLIT0]].sub2_sub3 {
+# CHECK-NEXT: internal [[SPLIT2]].sub0:av_512_align2 = COPY [[SPLIT0]].sub0
# CHECK-NEXT: }
-# CHECK-NEXT: [[SPLIT9:%[0-9]+]].sub3:av_512_align2 = COPY [[SPLIT8]].sub2
-# CHECK-NEXT: undef [[SPLIT10:%[0-9]+]].sub0_sub1_sub2_sub3:av_512_align2 = COPY [[SPLIT9]].sub0_sub1_sub2_sub3
-# CHECK-NEXT: undef [[SPLIT13:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_512_align2 = COPY [[SPLIT10]].sub0_sub1_sub2_sub3
-# CHECK-NEXT: [[MFMA_USE1:%[0-9]+]].sub4:vreg_512_align2 = COPY [[SPLIT8]].sub0
+# CHECK-NEXT: [[RESTORE2:%[0-9]+]]:av_512_align2 = SI_SPILL_AV512_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.0, align 4, addrspace 5)
+# CHECK-NEXT: [[MFMA_USE1:%[0-9]+]].sub0_sub1:vreg_512_align2 = COPY [[RESTORE2]].sub0_sub1
+# CHECK-NEXT: [[MFMA_USE1]].sub2:vreg_512_align2 = COPY [[SPLIT2]].sub3
+# CHECK-NEXT: [[MFMA_USE1]].sub3:vreg_512_align2 = COPY [[SPLIT2]].sub2
+# CHECK-NEXT: [[MFMA_USE1]].sub4:vreg_512_align2 = COPY [[SPLIT2]].sub0
# CHECK-NEXT: [[MFMA_USE1]].sub5:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
# CHECK-NEXT: [[MFMA_USE1]].sub6:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
# CHECK-NEXT: [[MFMA_USE1]].sub7:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll
index 1ac75d399e67d..d8c983a081b98 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll
@@ -1331,16 +1331,16 @@ define amdgpu_kernel void @v_insertelement_v16bf16_3(ptr addrspace(1) %out, ptr
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX942-NEXT: s_load_dword s6, s[4:5], 0x10
; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v8, 5, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, 0x5040100
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, 5, v0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0x5040100
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3]
-; GFX942-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:16
+; GFX942-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
+; GFX942-NEXT: global_load_dwordx4 v[6:9], v4, s[2:3] offset:16
; GFX942-NEXT: s_waitcnt vmcnt(1)
-; GFX942-NEXT: v_perm_b32 v1, s6, v1, v9
+; GFX942-NEXT: v_perm_b32 v1, s6, v1, v5
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
diff --git a/llvm/test/CodeGen/AMDGPU/large-avgpr-assign-last.mir b/llvm/test/CodeGen/AMDGPU/large-avgpr-assign-last.mir
new file mode 100644
index 0000000000000..58e9b0a1aedd4
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/large-avgpr-assign-last.mir
@@ -0,0 +1,94 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -verify-regalloc -greedy-regclass-priority-trumps-globalness=1 -start-after=machine-scheduler -stop-after=virtregrewriter,2 -o - %s | FileCheck %s
+
+--- |
+ define void @temp_vgpr_to_agpr_should_not_undo_split_with_remat() #0 {
+ entry:
+ unreachable
+ }
+
+ attributes #0 = { "amdgpu-agpr-alloc"="0,0" }
+...
+
+
+---
+name: temp_vgpr_to_agpr_should_not_undo_split_with_remat
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+ scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99'
+ stackPtrOffsetReg: '$sgpr32'
+ argumentInfo:
+ privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+ kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+ workGroupIDX: { reg: '$sgpr6' }
+ privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+ workItemIDX: { reg: '$vgpr0' }
+ sgprForEXECCopy: '$sgpr100_sgpr101'
+body: |
+ bb.0:
+ liveins: $vgpr0, $sgpr4_sgpr5
+ ; CHECK-LABEL: name: temp_vgpr_to_agpr_should_not_undo_split_with_remat
+ ; CHECK: liveins: $vgpr0, $sgpr4_sgpr5
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF
+ ; CHECK-NEXT: dead renamable $vgpr1 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr1 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr2 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr3 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr4 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr5 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr6 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr7 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr8 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr9 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr10 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr11 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr12 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr13 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr14 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr15 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr16 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr17 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr18 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr19 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr20 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr21 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr22 = IMPLICIT_DEF
+ ; CHECK-NEXT: KILL killed renamable $vgpr2, killed renamable $vgpr3, killed renamable $vgpr4, killed renamable $vgpr5, killed renamable $vgpr6, killed renamable $vgpr7, killed renamable $vgpr8, killed renamable $vgpr9, killed renamable $vgpr10, killed renamable $vgpr11, killed renamable $vgpr12, killed renamable $vgpr13, killed renamable $vgpr14, killed renamable $vgpr15, killed renamable $vgpr16
+ ; CHECK-NEXT: S_NOP 0, implicit-def renamable $vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38
+ ; CHECK-NEXT: S_NOP 0, implicit-def renamable $vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54
+ ; CHECK-NEXT: KILL killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr17, killed renamable $vgpr18, killed renamable $vgpr19, killed renamable $vgpr20, killed renamable $vgpr21, killed renamable $vgpr22
+ ; CHECK-NEXT: S_NOP 0, implicit killed renamable $vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38, implicit killed renamable $vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54
+ ; CHECK-NEXT: S_ENDPGM 0
+ %1:vgpr_32 = IMPLICIT_DEF
+ %2:vgpr_32 = IMPLICIT_DEF
+ %2:vgpr_32 = IMPLICIT_DEF
+ %3:vgpr_32 = IMPLICIT_DEF
+ %4:vgpr_32 = IMPLICIT_DEF
+ %5:vgpr_32 = IMPLICIT_DEF
+ %6:vgpr_32 = IMPLICIT_DEF
+ %7:vgpr_32 = IMPLICIT_DEF
+ %8:vgpr_32 = IMPLICIT_DEF
+ %9:vgpr_32 = IMPLICIT_DEF
+ %10:vgpr_32 = IMPLICIT_DEF
+ %11:vgpr_32 = IMPLICIT_DEF
+ %12:vgpr_32 = IMPLICIT_DEF
+ %13:vgpr_32 = IMPLICIT_DEF
+ %14:vgpr_32 = IMPLICIT_DEF
+ %15:vgpr_32 = IMPLICIT_DEF
+ %16:vgpr_32 = IMPLICIT_DEF
+ %17:vgpr_32 = IMPLICIT_DEF
+ %18:vgpr_32 = IMPLICIT_DEF
+ %19:vgpr_32 = IMPLICIT_DEF
+ %20:vgpr_32 = IMPLICIT_DEF
+ %21:vgpr_32 = IMPLICIT_DEF
+ %22:vgpr_32 = IMPLICIT_DEF
+ %23:vgpr_32 = IMPLICIT_DEF
+ KILL %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, %16, %17
+ S_NOP 0, implicit-def %50:av_512
+ S_NOP 0, implicit-def %51:av_512
+ KILL %1, %2, %18, %19, %20, %21, %22, %23
+ S_NOP 0, implicit %50, implicit %51
+ S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.AFLCustomIRMutator.opt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.AFLCustomIRMutator.opt.ll
index 85dd275207902..fcdad53553823 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.AFLCustomIRMutator.opt.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.AFLCustomIRMutator.opt.ll
@@ -4,30 +4,30 @@
define amdgpu_kernel void @test_iglp_opt_rev_mfma_gemm(<1 x i64> %L1) {
; GCN-LABEL: test_iglp_opt_rev_mfma_gemm:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: v_mov_b32_e32 v32, 0
-; GCN-NEXT: ds_read_b128 v[0:3], v32
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: ds_read_b128 v[2:5], v0
; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GCN-NEXT: ds_read_b128 v[28:31], v32 offset:112
-; GCN-NEXT: ds_read_b128 v[24:27], v32 offset:96
-; GCN-NEXT: ds_read_b128 v[20:23], v32 offset:80
-; GCN-NEXT: ds_read_b128 v[16:19], v32 offset:64
-; GCN-NEXT: ds_read_b128 v[4:7], v32 offset:16
-; GCN-NEXT: ds_read_b128 v[8:11], v32 offset:32
-; GCN-NEXT: ds_read_b128 v[12:15], v32 offset:48
+; GCN-NEXT: ds_read_b128 v[30:33], v0 offset:112
+; GCN-NEXT: ds_read_b128 v[26:29], v0 offset:96
+; GCN-NEXT: ds_read_b128 v[22:25], v0 offset:80
+; GCN-NEXT: ds_read_b128 v[18:21], v0 offset:64
+; GCN-NEXT: ds_read_b128 v[6:9], v0 offset:16
+; GCN-NEXT: ds_read_b128 v[10:13], v0 offset:32
+; GCN-NEXT: ds_read_b128 v[14:17], v0 offset:48
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: ds_write_b128 v32, v[0:3]
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: v_mov_b32_e32 v1, v0
+; GCN-NEXT: ds_write_b128 v0, v[2:5]
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: v_mov_b32_e32 v3, v2
; GCN-NEXT: s_cmp_lg_u64 s[0:1], 0
; GCN-NEXT: ; iglp_opt mask(0x00000001)
-; GCN-NEXT: ds_write_b128 v32, v[28:31] offset:112
-; GCN-NEXT: ds_write_b128 v32, v[24:27] offset:96
-; GCN-NEXT: ds_write_b128 v32, v[20:23] offset:80
-; GCN-NEXT: ds_write_b128 v32, v[16:19] offset:64
-; GCN-NEXT: ds_write_b128 v32, v[12:15] offset:48
-; GCN-NEXT: ds_write_b128 v32, v[8:11] offset:32
-; GCN-NEXT: ds_write_b128 v32, v[4:7] offset:16
-; GCN-NEXT: ds_write_b64 v32, v[0:1]
+; GCN-NEXT: ds_write_b128 v0, v[30:33] offset:112
+; GCN-NEXT: ds_write_b128 v0, v[26:29] offset:96
+; GCN-NEXT: ds_write_b128 v0, v[22:25] offset:80
+; GCN-NEXT: ds_write_b128 v0, v[18:21] offset:64
+; GCN-NEXT: ds_write_b128 v0, v[14:17] offset:48
+; GCN-NEXT: ds_write_b128 v0, v[10:13] offset:32
+; GCN-NEXT: ds_write_b128 v0, v[6:9] offset:16
+; GCN-NEXT: ds_write_b64 v0, v[2:3]
; GCN-NEXT: s_endpgm
entry:
call void @llvm.amdgcn.iglp.opt(i32 1)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.gfx90a.ll
index ed7d88b162bac..dcac419f8591d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.gfx90a.ll
@@ -18,19 +18,22 @@ define amdgpu_ps <4 x float> @load_1d_lwe(<8 x i32> inreg %rsrc, ptr addrspace(1
; GCN-LABEL: load_1d_lwe:
; GCN: ; %bb.0: ; %main_body
; GCN-NEXT: v_mov_b32_e32 v8, 0
-; GCN-NEXT: v_mov_b32_e32 v6, v0
; GCN-NEXT: v_mov_b32_e32 v9, v8
; GCN-NEXT: v_mov_b32_e32 v10, v8
; GCN-NEXT: v_mov_b32_e32 v11, v8
; GCN-NEXT: v_mov_b32_e32 v12, v8
-; GCN-NEXT: v_mov_b32_e32 v0, v8
-; GCN-NEXT: v_mov_b32_e32 v1, v9
-; GCN-NEXT: v_mov_b32_e32 v2, v10
-; GCN-NEXT: v_mov_b32_e32 v3, v11
-; GCN-NEXT: v_mov_b32_e32 v4, v12
-; GCN-NEXT: image_load v[0:4], v6, s[0:7] dmask:0xf unorm lwe
+; GCN-NEXT: v_mov_b32_e32 v2, v8
+; GCN-NEXT: v_mov_b32_e32 v3, v9
+; GCN-NEXT: v_mov_b32_e32 v4, v10
+; GCN-NEXT: v_mov_b32_e32 v5, v11
+; GCN-NEXT: v_mov_b32_e32 v6, v12
+; GCN-NEXT: image_load v[2:6], v0, s[0:7] dmask:0xf unorm lwe
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dword v8, v4, s[8:9]
+; GCN-NEXT: v_mov_b32_e32 v0, v2
+; GCN-NEXT: v_mov_b32_e32 v1, v3
+; GCN-NEXT: v_mov_b32_e32 v2, v4
+; GCN-NEXT: v_mov_b32_e32 v3, v5
+; GCN-NEXT: global_store_dword v8, v6, s[8:9]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
main_body:
@@ -75,6 +78,27 @@ main_body:
}
define amdgpu_ps <4 x float> @load_cube_lwe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %slice) {
+; GCN-LABEL: load_cube_lwe:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: v_mov_b32_e32 v10, 0
+; GCN-NEXT: v_mov_b32_e32 v11, v10
+; GCN-NEXT: v_mov_b32_e32 v12, v10
+; GCN-NEXT: v_mov_b32_e32 v13, v10
+; GCN-NEXT: v_mov_b32_e32 v14, v10
+; GCN-NEXT: v_mov_b32_e32 v4, v10
+; GCN-NEXT: v_mov_b32_e32 v5, v11
+; GCN-NEXT: v_mov_b32_e32 v6, v12
+; GCN-NEXT: v_mov_b32_e32 v7, v13
+; GCN-NEXT: v_mov_b32_e32 v8, v14
+; GCN-NEXT: image_load v[4:8], v[0:2], s[0:7] dmask:0xf unorm lwe da
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, v4
+; GCN-NEXT: v_mov_b32_e32 v1, v5
+; GCN-NEXT: v_mov_b32_e32 v2, v6
+; GCN-NEXT: v_mov_b32_e32 v3, v7
+; GCN-NEXT: global_store_dword v10, v8, s[8:9]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: ; return to shader part epilog
main_body:
%v = call {<4 x float>,i32} @llvm.amdgcn.image.load.cube.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 2, i32 0)
%v.vec = extractvalue {<4 x float>, i32} %v, 0
@@ -106,6 +130,27 @@ main_body:
}
define amdgpu_ps <4 x float> @load_2darray_lwe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %slice) {
+; GCN-LABEL: load_2darray_lwe:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: v_mov_b32_e32 v10, 0
+; GCN-NEXT: v_mov_b32_e32 v11, v10
+; GCN-NEXT: v_mov_b32_e32 v12, v10
+; GCN-NEXT: v_mov_b32_e32 v13, v10
+; GCN-NEXT: v_mov_b32_e32 v14, v10
+; GCN-NEXT: v_mov_b32_e32 v4, v10
+; GCN-NEXT: v_mov_b32_e32 v5, v11
+; GCN-NEXT: v_mov_b32_e32 v6, v12
+; GCN-NEXT: v_mov_b32_e32 v7, v13
+; GCN-NEXT: v_mov_b32_e32 v8, v14
+; GCN-NEXT: image_load v[4:8], v[0:2], s[0:7] dmask:0xf unorm lwe da
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, v4
+; GCN-NEXT: v_mov_b32_e32 v1, v5
+; GCN-NEXT: v_mov_b32_e32 v2, v6
+; GCN-NEXT: v_mov_b32_e32 v3, v7
+; GCN-NEXT: global_store_dword v10, v8, s[8:9]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: ; return to shader part epilog
main_body:
%v = call {<4 x float>,i32} @llvm.amdgcn.image.load.2darray.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 2, i32 0)
%v.vec = extractvalue {<4 x float>, i32} %v, 0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
index 780c7e93f80d0..572b72737fece 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
@@ -443,10 +443,10 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl
; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: v_mov_b32_e32 v2, s10
; GFX90A-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
-; GFX90A-NEXT: v_mov_b32_e32 v1, s11
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v3, s11
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[12:13], s[12:13] op_sel:[0,1]
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0
; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1
@@ -457,7 +457,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl
; GFX90A-NEXT: v_accvgpr_write_b32 a6, s6
; GFX90A-NEXT: v_accvgpr_write_b32 a7, s7
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[2:3], v[0:1], a[0:7] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 7
@@ -471,10 +471,10 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl
; GFX942-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: v_mov_b32_e32 v2, s10
; GFX942-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
-; GFX942-NEXT: v_mov_b32_e32 v1, s11
-; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[12:13]
+; GFX942-NEXT: v_mov_b32_e32 v3, s11
+; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: v_accvgpr_write_b32 a0, s0
; GFX942-NEXT: v_accvgpr_write_b32 a1, s1
@@ -485,7 +485,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl
; GFX942-NEXT: v_accvgpr_write_b32 a6, s6
; GFX942-NEXT: v_accvgpr_write_b32 a7, s7
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[2:3], v[0:1], a[0:7] cbsz:1 abid:2 neg:[1,1,0]
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 7
@@ -898,20 +898,20 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0x3ff00000
-; GFX90A-NEXT: v_accvgpr_write_b32 a7, v2
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0x3ff00000
+; GFX90A-NEXT: v_accvgpr_write_b32 a7, v0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, s2
-; GFX90A-NEXT: v_mov_b32_e32 v1, s3
+; GFX90A-NEXT: v_mov_b32_e32 v2, s2
+; GFX90A-NEXT: v_mov_b32_e32 v3, s3
; GFX90A-NEXT: v_accvgpr_mov_b32 a1, a0
; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0
; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a0
; GFX90A-NEXT: v_accvgpr_mov_b32 a4, a0
; GFX90A-NEXT: v_accvgpr_mov_b32 a5, a0
; GFX90A-NEXT: v_accvgpr_mov_b32 a6, a0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
+; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[2:3], v[0:1], a[0:7]
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 7
@@ -925,20 +925,20 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX942-NEXT: v_accvgpr_write_b32 a0, 0
-; GFX942-NEXT: v_mov_b32_e32 v2, 0x3ff00000
-; GFX942-NEXT: v_accvgpr_write_b32 a7, v2
+; GFX942-NEXT: v_mov_b32_e32 v0, 0x3ff00000
+; GFX942-NEXT: v_accvgpr_write_b32 a7, v0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v0, s2
-; GFX942-NEXT: v_mov_b32_e32 v1, s3
+; GFX942-NEXT: v_mov_b32_e32 v2, s2
+; GFX942-NEXT: v_mov_b32_e32 v3, s3
; GFX942-NEXT: v_accvgpr_mov_b32 a1, a0
; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0
; GFX942-NEXT: v_accvgpr_mov_b32 a3, a0
; GFX942-NEXT: v_accvgpr_mov_b32 a4, a0
; GFX942-NEXT: v_accvgpr_mov_b32 a5, a0
; GFX942-NEXT: v_accvgpr_mov_b32 a6, a0
-; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7]
+; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[2:3], v[0:1], a[0:7]
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 7
@@ -957,21 +957,21 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %
; GFX90A: ; %bb.0: ; %bb
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0x405ec000
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0x405ec000
; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, s2
-; GFX90A-NEXT: v_mov_b32_e32 v1, s3
+; GFX90A-NEXT: v_mov_b32_e32 v2, s2
+; GFX90A-NEXT: v_mov_b32_e32 v3, s3
; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0
; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a1
; GFX90A-NEXT: v_accvgpr_mov_b32 a4, a0
; GFX90A-NEXT: v_accvgpr_mov_b32 a5, a1
; GFX90A-NEXT: v_accvgpr_mov_b32 a6, a0
; GFX90A-NEXT: v_accvgpr_mov_b32 a7, a1
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
+; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[2:3], v[0:1], a[0:7]
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 7
@@ -984,21 +984,21 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT: v_mov_b32_e32 v2, 0x405ec000
+; GFX942-NEXT: v_mov_b32_e32 v0, 0x405ec000
; GFX942-NEXT: v_accvgpr_write_b32 a0, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a1, v2
+; GFX942-NEXT: v_accvgpr_write_b32 a1, v0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v0, s2
-; GFX942-NEXT: v_mov_b32_e32 v1, s3
+; GFX942-NEXT: v_mov_b32_e32 v2, s2
+; GFX942-NEXT: v_mov_b32_e32 v3, s3
; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0
; GFX942-NEXT: v_accvgpr_mov_b32 a3, a1
; GFX942-NEXT: v_accvgpr_mov_b32 a4, a0
; GFX942-NEXT: v_accvgpr_mov_b32 a5, a1
; GFX942-NEXT: v_accvgpr_mov_b32 a6, a0
; GFX942-NEXT: v_accvgpr_mov_b32 a7, a1
-; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7]
+; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[2:3], v[0:1], a[0:7]
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 7
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll
index 92af34f25a1a6..7d85d3439eed9 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll
@@ -35,26 +35,26 @@ declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.bf8(<2 x i32>, <4 x i3
declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.fp8(<2 x i32>, <4 x i32>, <16 x float>, i32, i32, i32)
define amdgpu_kernel void @test_mfma_i32_16x16x32i8(ptr addrspace(1) %arg) #0 {
-; GFX942-SDAG-LABEL: test_mfma_i32_16x16x32i8:
-; GFX942-SDAG: ; %bb.0: ; %bb
-; GFX942-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 2
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 4
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 3
-; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX942-SDAG-NEXT: s_nop 1
-; GFX942-SDAG-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX942-SDAG-NEXT: s_nop 6
-; GFX942-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
-; GFX942-SDAG-NEXT: s_endpgm
+; GFX942-VGPRCD-SDAG-LABEL: test_mfma_i32_16x16x32i8:
+; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-VGPRCD-SDAG-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: test_mfma_i32_16x16x32i8:
; GFX942-GISEL: ; %bb.0: ; %bb
@@ -77,26 +77,47 @@ define amdgpu_kernel void @test_mfma_i32_16x16x32i8(ptr addrspace(1) %arg) #0 {
; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
; GFX942-GISEL-NEXT: s_endpgm
;
-; GFX950-SDAG-LABEL: test_mfma_i32_16x16x32i8:
-; GFX950-SDAG: ; %bb.0: ; %bb
-; GFX950-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 2
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 1
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 4
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, 3
-; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, 0
-; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX950-SDAG-NEXT: s_nop 1
-; GFX950-SDAG-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX950-SDAG-NEXT: s_nop 7
-; GFX950-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
-; GFX950-SDAG-NEXT: s_endpgm
+; GFX942-AGPRCD-SDAG-LABEL: test_mfma_i32_16x16x32i8:
+; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
+; GFX942-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-SDAG-LABEL: test_mfma_i32_16x16x32i8:
+; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-VGPRCD-SDAG-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: test_mfma_i32_16x16x32i8:
; GFX950-GISEL: ; %bb.0: ; %bb
@@ -118,6 +139,27 @@ define amdgpu_kernel void @test_mfma_i32_16x16x32i8(ptr addrspace(1) %arg) #0 {
; GFX950-GISEL-NEXT: s_nop 6
; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
; GFX950-GISEL-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-SDAG-LABEL: test_mfma_i32_16x16x32i8:
+; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-AGPRCD-SDAG-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
+; GFX950-AGPRCD-SDAG-NEXT: s_endpgm
bb:
%in.1 = load <4 x i32>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x32.i8(i64 4294967298, i64 12884901892, <4 x i32> %in.1, i32 1, i32 2, i32 3)
@@ -281,26 +323,26 @@ bb:
}
define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_bf8(ptr addrspace(1) %arg) #0 {
-; GFX942-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8:
-; GFX942-SDAG: ; %bb.0: ; %bb
-; GFX942-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 2
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 4
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 3
-; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX942-SDAG-NEXT: s_nop 1
-; GFX942-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX942-SDAG-NEXT: s_nop 6
-; GFX942-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
-; GFX942-SDAG-NEXT: s_endpgm
+; GFX942-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8:
+; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-VGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: test_mfma_f32_16x16x32_bf8_bf8:
; GFX942-GISEL: ; %bb.0: ; %bb
@@ -323,26 +365,47 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_bf8(ptr addrspace(1) %arg)
; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
; GFX942-GISEL-NEXT: s_endpgm
;
-; GFX950-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8:
-; GFX950-SDAG: ; %bb.0: ; %bb
-; GFX950-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 2
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 1
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 4
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, 3
-; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, 0
-; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX950-SDAG-NEXT: s_nop 1
-; GFX950-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX950-SDAG-NEXT: s_nop 7
-; GFX950-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
-; GFX950-SDAG-NEXT: s_endpgm
+; GFX942-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8:
+; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
+; GFX942-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8:
+; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-VGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: test_mfma_f32_16x16x32_bf8_bf8:
; GFX950-GISEL: ; %bb.0: ; %bb
@@ -364,6 +427,27 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_bf8(ptr addrspace(1) %arg)
; GFX950-GISEL-NEXT: s_nop 6
; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
; GFX950-GISEL-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8:
+; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-AGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
+; GFX950-AGPRCD-SDAG-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf8.bf8(i64 4294967298, i64 12884901892, <4 x float> %in.1, i32 1, i32 2, i32 3)
@@ -372,26 +456,26 @@ bb:
}
define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_fp8(ptr addrspace(1) %arg) #0 {
-; GFX942-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8:
-; GFX942-SDAG: ; %bb.0: ; %bb
-; GFX942-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 2
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 4
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 3
-; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX942-SDAG-NEXT: s_nop 1
-; GFX942-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX942-SDAG-NEXT: s_nop 6
-; GFX942-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
-; GFX942-SDAG-NEXT: s_endpgm
+; GFX942-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8:
+; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-VGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: test_mfma_f32_16x16x32_bf8_fp8:
; GFX942-GISEL: ; %bb.0: ; %bb
@@ -414,26 +498,47 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_fp8(ptr addrspace(1) %arg)
; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
; GFX942-GISEL-NEXT: s_endpgm
;
-; GFX950-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8:
-; GFX950-SDAG: ; %bb.0: ; %bb
-; GFX950-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 2
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 1
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 4
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, 3
-; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, 0
-; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX950-SDAG-NEXT: s_nop 1
-; GFX950-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX950-SDAG-NEXT: s_nop 7
-; GFX950-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
-; GFX950-SDAG-NEXT: s_endpgm
+; GFX942-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8:
+; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
+; GFX942-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8:
+; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-VGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: test_mfma_f32_16x16x32_bf8_fp8:
; GFX950-GISEL: ; %bb.0: ; %bb
@@ -455,6 +560,27 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_fp8(ptr addrspace(1) %arg)
; GFX950-GISEL-NEXT: s_nop 6
; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
; GFX950-GISEL-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8:
+; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-AGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
+; GFX950-AGPRCD-SDAG-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf8.fp8(i64 4294967298, i64 12884901892, <4 x float> %in.1, i32 1, i32 2, i32 3)
@@ -463,26 +589,26 @@ bb:
}
define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_bf8(ptr addrspace(1) %arg) #0 {
-; GFX942-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8:
-; GFX942-SDAG: ; %bb.0: ; %bb
-; GFX942-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 2
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 4
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 3
-; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX942-SDAG-NEXT: s_nop 1
-; GFX942-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX942-SDAG-NEXT: s_nop 6
-; GFX942-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
-; GFX942-SDAG-NEXT: s_endpgm
+; GFX942-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8:
+; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-VGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: test_mfma_f32_16x16x32_fp8_bf8:
; GFX942-GISEL: ; %bb.0: ; %bb
@@ -505,26 +631,47 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_bf8(ptr addrspace(1) %arg)
; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
; GFX942-GISEL-NEXT: s_endpgm
;
-; GFX950-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8:
-; GFX950-SDAG: ; %bb.0: ; %bb
-; GFX950-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 2
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 1
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 4
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, 3
-; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, 0
-; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX950-SDAG-NEXT: s_nop 1
-; GFX950-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX950-SDAG-NEXT: s_nop 7
-; GFX950-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
-; GFX950-SDAG-NEXT: s_endpgm
+; GFX942-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8:
+; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
+; GFX942-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8:
+; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-VGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: test_mfma_f32_16x16x32_fp8_bf8:
; GFX950-GISEL: ; %bb.0: ; %bb
@@ -546,6 +693,27 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_bf8(ptr addrspace(1) %arg)
; GFX950-GISEL-NEXT: s_nop 6
; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
; GFX950-GISEL-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8:
+; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-AGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
+; GFX950-AGPRCD-SDAG-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.fp8.bf8(i64 4294967298, i64 12884901892, <4 x float> %in.1, i32 1, i32 2, i32 3)
@@ -554,26 +722,26 @@ bb:
}
define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_fp8(ptr addrspace(1) %arg) #0 {
-; GFX942-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8:
-; GFX942-SDAG: ; %bb.0: ; %bb
-; GFX942-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 2
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 4
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 3
-; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX942-SDAG-NEXT: s_nop 1
-; GFX942-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX942-SDAG-NEXT: s_nop 6
-; GFX942-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
-; GFX942-SDAG-NEXT: s_endpgm
+; GFX942-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8:
+; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-VGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: test_mfma_f32_16x16x32_fp8_fp8:
; GFX942-GISEL: ; %bb.0: ; %bb
@@ -596,26 +764,47 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_fp8(ptr addrspace(1) %arg)
; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
; GFX942-GISEL-NEXT: s_endpgm
;
-; GFX950-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8:
-; GFX950-SDAG: ; %bb.0: ; %bb
-; GFX950-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 2
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 1
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 4
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, 3
-; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, 0
-; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX950-SDAG-NEXT: s_nop 1
-; GFX950-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX950-SDAG-NEXT: s_nop 7
-; GFX950-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
-; GFX950-SDAG-NEXT: s_endpgm
+; GFX942-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8:
+; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
+; GFX942-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8:
+; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-VGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: test_mfma_f32_16x16x32_fp8_fp8:
; GFX950-GISEL: ; %bb.0: ; %bb
@@ -637,6 +826,27 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_fp8(ptr addrspace(1) %arg)
; GFX950-GISEL-NEXT: s_nop 6
; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
; GFX950-GISEL-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8:
+; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-AGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
+; GFX950-AGPRCD-SDAG-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.fp8.fp8(i64 4294967298, i64 12884901892, <4 x float> %in.1, i32 1, i32 2, i32 3)
@@ -1269,20 +1479,20 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_f16(ptr addrspace(1) %arg, <
; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX942-VGPRCD-SDAG-NEXT: s_load_dword s6, s[4:5], 0x44
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, 0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, 0
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s6
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v7, s6
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x32_f16 v[4:7], v[8:9], v[0:3], v11 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2
; GFX942-VGPRCD-SDAG-NEXT: s_nop 6
-; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v10, v[4:7], s[8:9]
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v6, v[8:11], s[8:9]
; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_f32_16x16x32_f16:
@@ -1291,18 +1501,18 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_f16(ptr addrspace(1) %arg, <
; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s6, s[4:5], 0x44
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v10, s6
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s6
; GFX942-VGPRCD-GISEL-NEXT: s_nop 1
-; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x32_f16 v[4:7], v[8:9], v[0:3], v10 cbsz:1 abid:2
+; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX942-VGPRCD-GISEL-NEXT: s_nop 5
-; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[4:7], s[8:9]
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[8:9]
; GFX942-VGPRCD-GISEL-NEXT: s_endpgm
;
; GFX942-AGPRCD-LABEL: test_smfmac_f32_16x16x32_f16:
@@ -1332,20 +1542,20 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_f16(ptr addrspace(1) %arg, <
; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX950-VGPRCD-SDAG-NEXT: s_load_dword s6, s[4:5], 0x44
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, 0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, 0
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s6
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v7, s6
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x32_f16 v[4:7], v[8:9], v[0:3], v11 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2
; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
-; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v10, v[4:7], s[8:9]
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v6, v[8:11], s[8:9]
; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_f32_16x16x32_f16:
@@ -1354,18 +1564,18 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_f16(ptr addrspace(1) %arg, <
; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s6, s[4:5], 0x44
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v10, s6
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s6
; GFX950-VGPRCD-GISEL-NEXT: s_nop 1
-; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x32_f16 v[4:7], v[8:9], v[0:3], v10 cbsz:1 abid:2
+; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX950-VGPRCD-GISEL-NEXT: s_nop 6
-; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[4:7], s[8:9]
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[8:9]
; GFX950-VGPRCD-GISEL-NEXT: s_endpgm
;
; GFX950-AGPRCD-LABEL: test_smfmac_f32_16x16x32_f16:
@@ -1681,20 +1891,20 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_bf16(ptr addrspace(1) %arg,
; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX942-VGPRCD-SDAG-NEXT: s_load_dword s6, s[4:5], 0x44
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, 0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, 0
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s6
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v7, s6
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x32_bf16 v[4:7], v[8:9], v[0:3], v11 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2
; GFX942-VGPRCD-SDAG-NEXT: s_nop 6
-; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v10, v[4:7], s[8:9]
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v6, v[8:11], s[8:9]
; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_f32_16x16x32_bf16:
@@ -1703,18 +1913,18 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_bf16(ptr addrspace(1) %arg,
; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s6, s[4:5], 0x44
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v10, s6
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s6
; GFX942-VGPRCD-GISEL-NEXT: s_nop 1
-; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x32_bf16 v[4:7], v[8:9], v[0:3], v10 cbsz:1 abid:2
+; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX942-VGPRCD-GISEL-NEXT: s_nop 5
-; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[4:7], s[8:9]
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[8:9]
; GFX942-VGPRCD-GISEL-NEXT: s_endpgm
;
; GFX942-AGPRCD-LABEL: test_smfmac_f32_16x16x32_bf16:
@@ -1744,20 +1954,20 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_bf16(ptr addrspace(1) %arg,
; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX950-VGPRCD-SDAG-NEXT: s_load_dword s6, s[4:5], 0x44
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, 0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, 0
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s6
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v7, s6
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x32_bf16 v[4:7], v[8:9], v[0:3], v11 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2
; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
-; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v10, v[4:7], s[8:9]
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v6, v[8:11], s[8:9]
; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_f32_16x16x32_bf16:
@@ -1766,18 +1976,18 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_bf16(ptr addrspace(1) %arg,
; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s6, s[4:5], 0x44
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v10, s6
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s6
; GFX950-VGPRCD-GISEL-NEXT: s_nop 1
-; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x32_bf16 v[4:7], v[8:9], v[0:3], v10 cbsz:1 abid:2
+; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX950-VGPRCD-GISEL-NEXT: s_nop 6
-; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[4:7], s[8:9]
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[8:9]
; GFX950-VGPRCD-GISEL-NEXT: s_endpgm
;
; GFX950-AGPRCD-LABEL: test_smfmac_f32_16x16x32_bf16:
@@ -2093,23 +2303,23 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_i8(ptr addrspace(1) %arg, <2
; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, 0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v8, s8
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v9, s9
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s14
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_i32_16x16x64_i8 v[4:7], v[8:9], v[0:3], v11 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_i32_16x16x64_i8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
; GFX942-VGPRCD-SDAG-NEXT: s_nop 6
-; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v10, v[4:7], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7]
; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_i8:
@@ -2119,21 +2329,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_i8(ptr addrspace(1) %arg, <2
; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2
; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9]
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11]
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v10, s14
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14
; GFX942-VGPRCD-GISEL-NEXT: s_nop 1
-; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_i32_16x16x64_i8 v[4:7], v[8:9], v[0:3], v10 cbsz:1 abid:2
+; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_i32_16x16x64_i8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX942-VGPRCD-GISEL-NEXT: s_nop 5
-; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[4:7], s[12:13]
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
; GFX942-VGPRCD-GISEL-NEXT: s_endpgm
;
; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_i8:
@@ -2197,23 +2407,23 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_i8(ptr addrspace(1) %arg, <2
; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, 0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v8, s8
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v9, s9
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s14
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_i32_16x16x64_i8 v[4:7], v[8:9], v[0:3], v11 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_i32_16x16x64_i8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
-; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v10, v[4:7], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7]
; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_i8:
@@ -2223,21 +2433,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_i8(ptr addrspace(1) %arg, <2
; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2
; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9]
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11]
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v10, s14
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14
; GFX950-VGPRCD-GISEL-NEXT: s_nop 1
-; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_i32_16x16x64_i8 v[4:7], v[8:9], v[0:3], v10 cbsz:1 abid:2
+; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_i32_16x16x64_i8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX950-VGPRCD-GISEL-NEXT: s_nop 6
-; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[4:7], s[12:13]
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
; GFX950-VGPRCD-GISEL-NEXT: s_endpgm
;
; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_i8:
@@ -2309,16 +2519,16 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s16
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s17
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s18
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v17, s19
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s20
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s21
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s22
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -2327,7 +2537,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_i32_32x32x32_i8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_i32_32x32x32_i8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX942-VGPRCD-SDAG-NEXT: s_nop 7
; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
@@ -2461,16 +2671,16 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s16
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s17
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s18
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v17, s19
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s20
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s21
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s22
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -2479,7 +2689,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_i32_32x32x32_i8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_i32_32x32x32_i8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
; GFX950-VGPRCD-SDAG-NEXT: s_nop 2
@@ -2619,23 +2829,23 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_bf8(ptr addrspace(1) %ar
; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, 0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v8, s8
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v9, s9
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s14
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[4:7], v[8:9], v[0:3], v11 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
; GFX942-VGPRCD-SDAG-NEXT: s_nop 6
-; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v10, v[4:7], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7]
; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_bf8:
@@ -2645,21 +2855,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_bf8(ptr addrspace(1) %ar
; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2
; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9]
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11]
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v10, s14
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14
; GFX942-VGPRCD-GISEL-NEXT: s_nop 1
-; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[4:7], v[8:9], v[0:3], v10 cbsz:1 abid:2
+; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX942-VGPRCD-GISEL-NEXT: s_nop 5
-; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[4:7], s[12:13]
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
; GFX942-VGPRCD-GISEL-NEXT: s_endpgm
;
; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_bf8:
@@ -2723,23 +2933,23 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_bf8(ptr addrspace(1) %ar
; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, 0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v8, s8
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v9, s9
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s14
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[4:7], v[8:9], v[0:3], v11 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
-; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v10, v[4:7], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7]
; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_bf8:
@@ -2749,21 +2959,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_bf8(ptr addrspace(1) %ar
; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2
; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9]
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11]
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v10, s14
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14
; GFX950-VGPRCD-GISEL-NEXT: s_nop 1
-; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[4:7], v[8:9], v[0:3], v10 cbsz:1 abid:2
+; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX950-VGPRCD-GISEL-NEXT: s_nop 6
-; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[4:7], s[12:13]
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
; GFX950-VGPRCD-GISEL-NEXT: s_endpgm
;
; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_bf8:
@@ -2834,23 +3044,23 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_fp8(ptr addrspace(1) %ar
; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, 0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v8, s8
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v9, s9
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s14
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[4:7], v[8:9], v[0:3], v11 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
; GFX942-VGPRCD-SDAG-NEXT: s_nop 6
-; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v10, v[4:7], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7]
; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_fp8:
@@ -2860,21 +3070,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_fp8(ptr addrspace(1) %ar
; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2
; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9]
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11]
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v10, s14
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14
; GFX942-VGPRCD-GISEL-NEXT: s_nop 1
-; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[4:7], v[8:9], v[0:3], v10 cbsz:1 abid:2
+; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX942-VGPRCD-GISEL-NEXT: s_nop 5
-; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[4:7], s[12:13]
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
; GFX942-VGPRCD-GISEL-NEXT: s_endpgm
;
; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_fp8:
@@ -2938,23 +3148,23 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_fp8(ptr addrspace(1) %ar
; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, 0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v8, s8
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v9, s9
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s14
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[4:7], v[8:9], v[0:3], v11 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
-; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v10, v[4:7], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7]
; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_fp8:
@@ -2964,21 +3174,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_fp8(ptr addrspace(1) %ar
; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2
; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9]
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11]
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v10, s14
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14
; GFX950-VGPRCD-GISEL-NEXT: s_nop 1
-; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[4:7], v[8:9], v[0:3], v10 cbsz:1 abid:2
+; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX950-VGPRCD-GISEL-NEXT: s_nop 6
-; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[4:7], s[12:13]
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
; GFX950-VGPRCD-GISEL-NEXT: s_endpgm
;
; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_fp8:
@@ -3049,23 +3259,23 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_bf8(ptr addrspace(1) %ar
; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, 0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v8, s8
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v9, s9
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s14
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[4:7], v[8:9], v[0:3], v11 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
; GFX942-VGPRCD-SDAG-NEXT: s_nop 6
-; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v10, v[4:7], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7]
; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_bf8:
@@ -3075,21 +3285,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_bf8(ptr addrspace(1) %ar
; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2
; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9]
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11]
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v10, s14
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14
; GFX942-VGPRCD-GISEL-NEXT: s_nop 1
-; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[4:7], v[8:9], v[0:3], v10 cbsz:1 abid:2
+; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX942-VGPRCD-GISEL-NEXT: s_nop 5
-; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[4:7], s[12:13]
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
; GFX942-VGPRCD-GISEL-NEXT: s_endpgm
;
; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_bf8:
@@ -3153,23 +3363,23 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_bf8(ptr addrspace(1) %ar
; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, 0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v8, s8
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v9, s9
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s14
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[4:7], v[8:9], v[0:3], v11 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
-; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v10, v[4:7], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7]
; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_bf8:
@@ -3179,21 +3389,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_bf8(ptr addrspace(1) %ar
; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2
; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9]
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11]
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v10, s14
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14
; GFX950-VGPRCD-GISEL-NEXT: s_nop 1
-; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[4:7], v[8:9], v[0:3], v10 cbsz:1 abid:2
+; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX950-VGPRCD-GISEL-NEXT: s_nop 6
-; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[4:7], s[12:13]
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
; GFX950-VGPRCD-GISEL-NEXT: s_endpgm
;
; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_bf8:
@@ -3264,23 +3474,23 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_fp8(ptr addrspace(1) %ar
; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, 0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v8, s8
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v9, s9
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s14
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[4:7], v[8:9], v[0:3], v11 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
; GFX942-VGPRCD-SDAG-NEXT: s_nop 6
-; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v10, v[4:7], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7]
; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_fp8:
@@ -3290,21 +3500,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_fp8(ptr addrspace(1) %ar
; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2
; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9]
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11]
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v10, s14
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14
; GFX942-VGPRCD-GISEL-NEXT: s_nop 1
-; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[4:7], v[8:9], v[0:3], v10 cbsz:1 abid:2
+; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX942-VGPRCD-GISEL-NEXT: s_nop 5
-; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[4:7], s[12:13]
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
; GFX942-VGPRCD-GISEL-NEXT: s_endpgm
;
; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_fp8:
@@ -3368,23 +3578,23 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_fp8(ptr addrspace(1) %ar
; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, 0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v8, s8
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v9, s9
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s14
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[4:7], v[8:9], v[0:3], v11 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
-; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v10, v[4:7], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7]
; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_fp8:
@@ -3394,21 +3604,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_fp8(ptr addrspace(1) %ar
; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2
; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9]
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11]
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v10, s14
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14
; GFX950-VGPRCD-GISEL-NEXT: s_nop 1
-; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[4:7], v[8:9], v[0:3], v10 cbsz:1 abid:2
+; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX950-VGPRCD-GISEL-NEXT: s_nop 6
-; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[4:7], s[12:13]
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
; GFX950-VGPRCD-GISEL-NEXT: s_endpgm
;
; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_fp8:
@@ -3480,16 +3690,16 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s16
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s17
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s18
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v17, s19
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s20
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s21
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s22
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -3498,7 +3708,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX942-VGPRCD-SDAG-NEXT: s_nop 7
; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
@@ -3632,16 +3842,16 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s16
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s17
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s18
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v17, s19
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s20
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s21
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s22
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -3650,7 +3860,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
; GFX950-VGPRCD-SDAG-NEXT: s_nop 2
@@ -3791,16 +4001,16 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s16
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s17
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s18
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v17, s19
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s20
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s21
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s22
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -3809,7 +4019,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX942-VGPRCD-SDAG-NEXT: s_nop 7
; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
@@ -3943,16 +4153,16 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s16
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s17
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s18
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v17, s19
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s20
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s21
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s22
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -3961,7 +4171,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
; GFX950-VGPRCD-SDAG-NEXT: s_nop 2
@@ -4102,16 +4312,16 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s16
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s17
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s18
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v17, s19
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s20
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s21
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s22
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -4120,7 +4330,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX942-VGPRCD-SDAG-NEXT: s_nop 7
; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
@@ -4254,16 +4464,16 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s16
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s17
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s18
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v17, s19
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s20
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s21
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s22
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -4272,7 +4482,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
; GFX950-VGPRCD-SDAG-NEXT: s_nop 2
@@ -4413,16 +4623,16 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s16
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s17
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s18
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v17, s19
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s20
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s21
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s22
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -4431,7 +4641,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX942-VGPRCD-SDAG-NEXT: s_nop 7
; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
@@ -4565,16 +4775,16 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s16
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s17
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s18
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v17, s19
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s20
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s21
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s22
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -4583,7 +4793,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
; GFX950-VGPRCD-SDAG-NEXT: s_nop 2
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
index 452033f332659..d358837452eab 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
@@ -15,9 +15,9 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16(<8 x bfloat> %arg0, <8 x
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; GCN-NEXT: v_mov_b64_e32 v[12:13], 48
-; GCN-NEXT: v_mov_b64_e32 v[14:15], 32
-; GCN-NEXT: v_mov_b64_e32 v[16:17], 16
+; GCN-NEXT: v_mov_b64_e32 v[8:9], 48
+; GCN-NEXT: v_mov_b64_e32 v[10:11], 32
+; GCN-NEXT: v_mov_b64_e32 v[12:13], 16
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
@@ -39,42 +39,42 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16(<8 x bfloat> %arg0, <8 x
; GCN-NEXT: v_accvgpr_write_b32 a13, s21
; GCN-NEXT: v_accvgpr_write_b32 a14, s22
; GCN-NEXT: v_accvgpr_write_b32 a15, s23
-; GCN-NEXT: v_mov_b64_e32 v[18:19], 0
-; GCN-NEXT: v_mov_b32_e32 v8, s16
+; GCN-NEXT: v_mov_b64_e32 v[14:15], 0
+; GCN-NEXT: v_mov_b32_e32 v16, s16
; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[16:31], v[0:3], v[4:7], a[0:15]
; GCN-NEXT: v_mov_b32_e32 v0, s20
; GCN-NEXT: v_mov_b32_e32 v1, s21
; GCN-NEXT: v_mov_b32_e32 v2, s22
; GCN-NEXT: v_mov_b32_e32 v3, s23
-; GCN-NEXT: v_mov_b32_e32 v9, s17
-; GCN-NEXT: v_mov_b32_e32 v10, s18
-; GCN-NEXT: v_mov_b32_e32 v11, s19
+; GCN-NEXT: v_mov_b32_e32 v17, s17
+; GCN-NEXT: v_mov_b32_e32 v18, s18
+; GCN-NEXT: v_mov_b32_e32 v19, s19
; GCN-NEXT: s_nop 4
-; GCN-NEXT: global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v[16:17], a[20:23], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v[18:19], a[16:19], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v[14:15], v[8:11], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v0, s8
; GCN-NEXT: v_mov_b32_e32 v1, s9
; GCN-NEXT: v_mov_b32_e32 v2, s10
; GCN-NEXT: v_mov_b32_e32 v3, s11
-; GCN-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v0, s12
; GCN-NEXT: v_mov_b32_e32 v1, s13
; GCN-NEXT: v_mov_b32_e32 v2, s14
; GCN-NEXT: v_mov_b32_e32 v3, s15
-; GCN-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0)
@@ -88,9 +88,9 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__flags(<8 x bfloat> %arg0
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; GCN-NEXT: v_mov_b64_e32 v[12:13], 48
-; GCN-NEXT: v_mov_b64_e32 v[14:15], 32
-; GCN-NEXT: v_mov_b64_e32 v[16:17], 16
+; GCN-NEXT: v_mov_b64_e32 v[8:9], 48
+; GCN-NEXT: v_mov_b64_e32 v[10:11], 32
+; GCN-NEXT: v_mov_b64_e32 v[12:13], 16
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
@@ -112,42 +112,42 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__flags(<8 x bfloat> %arg0
; GCN-NEXT: v_accvgpr_write_b32 a13, s21
; GCN-NEXT: v_accvgpr_write_b32 a14, s22
; GCN-NEXT: v_accvgpr_write_b32 a15, s23
-; GCN-NEXT: v_mov_b64_e32 v[18:19], 0
-; GCN-NEXT: v_mov_b32_e32 v8, s16
+; GCN-NEXT: v_mov_b64_e32 v[14:15], 0
+; GCN-NEXT: v_mov_b32_e32 v16, s16
; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1
; GCN-NEXT: v_mov_b32_e32 v0, s20
; GCN-NEXT: v_mov_b32_e32 v1, s21
; GCN-NEXT: v_mov_b32_e32 v2, s22
; GCN-NEXT: v_mov_b32_e32 v3, s23
-; GCN-NEXT: v_mov_b32_e32 v9, s17
-; GCN-NEXT: v_mov_b32_e32 v10, s18
-; GCN-NEXT: v_mov_b32_e32 v11, s19
+; GCN-NEXT: v_mov_b32_e32 v17, s17
+; GCN-NEXT: v_mov_b32_e32 v18, s18
+; GCN-NEXT: v_mov_b32_e32 v19, s19
; GCN-NEXT: s_nop 4
-; GCN-NEXT: global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v[16:17], a[20:23], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v[18:19], a[16:19], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v[14:15], v[8:11], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v0, s8
; GCN-NEXT: v_mov_b32_e32 v1, s9
; GCN-NEXT: v_mov_b32_e32 v2, s10
; GCN-NEXT: v_mov_b32_e32 v3, s11
-; GCN-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v0, s12
; GCN-NEXT: v_mov_b32_e32 v1, s13
; GCN-NEXT: v_mov_b32_e32 v2, s14
; GCN-NEXT: v_mov_b32_e32 v3, s15
-; GCN-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 2, i32 3, i32 1)
@@ -252,7 +252,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd(<8 x bfloat> %arg
; GCN-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; GCN-NEXT: v_mov_b32_e32 v12, 0
+; GCN-NEXT: v_mov_b32_e32 v8, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
@@ -274,40 +274,40 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd(<8 x bfloat> %arg
; GCN-NEXT: v_accvgpr_write_b32 a18, s10
; GCN-NEXT: v_accvgpr_write_b32 a17, s9
; GCN-NEXT: v_accvgpr_write_b32 a16, s8
-; GCN-NEXT: v_mov_b32_e32 v8, s20
-; GCN-NEXT: v_mov_b32_e32 v9, s21
+; GCN-NEXT: v_mov_b32_e32 v10, s20
+; GCN-NEXT: v_mov_b32_e32 v11, s21
; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[16:31]
-; GCN-NEXT: v_mov_b32_e32 v10, s22
-; GCN-NEXT: v_mov_b32_e32 v11, s23
+; GCN-NEXT: v_mov_b32_e32 v12, s22
+; GCN-NEXT: v_mov_b32_e32 v13, s23
; GCN-NEXT: v_mov_b32_e32 v0, s16
; GCN-NEXT: v_mov_b32_e32 v1, s17
; GCN-NEXT: v_mov_b32_e32 v2, s18
; GCN-NEXT: v_mov_b32_e32 v3, s19
-; GCN-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:48 sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:48 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:32 sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v0, s12
; GCN-NEXT: v_mov_b32_e32 v1, s13
; GCN-NEXT: v_mov_b32_e32 v2, s14
; GCN-NEXT: v_mov_b32_e32 v3, s15
-; GCN-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v0, s8
; GCN-NEXT: v_mov_b32_e32 v1, s9
; GCN-NEXT: v_mov_b32_e32 v2, s10
; GCN-NEXT: v_mov_b32_e32 v3, s11
-; GCN-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v12, a[8:11], s[0:1] offset:32 sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v12, a[12:15], s[0:1] offset:48 sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v12, a[0:3], s[0:1] sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v12, a[4:7], s[0:1] offset:16 sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0)
@@ -322,7 +322,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd__flags(<8 x bfloa
; GCN-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; GCN-NEXT: v_mov_b32_e32 v12, 0
+; GCN-NEXT: v_mov_b32_e32 v8, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
@@ -344,40 +344,40 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd__flags(<8 x bfloa
; GCN-NEXT: v_accvgpr_write_b32 a18, s10
; GCN-NEXT: v_accvgpr_write_b32 a17, s9
; GCN-NEXT: v_accvgpr_write_b32 a16, s8
-; GCN-NEXT: v_mov_b32_e32 v8, s20
-; GCN-NEXT: v_mov_b32_e32 v9, s21
+; GCN-NEXT: v_mov_b32_e32 v10, s20
+; GCN-NEXT: v_mov_b32_e32 v11, s21
; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[16:31] cbsz:1 abid:2 blgp:3
-; GCN-NEXT: v_mov_b32_e32 v10, s22
-; GCN-NEXT: v_mov_b32_e32 v11, s23
+; GCN-NEXT: v_mov_b32_e32 v12, s22
+; GCN-NEXT: v_mov_b32_e32 v13, s23
; GCN-NEXT: v_mov_b32_e32 v0, s16
; GCN-NEXT: v_mov_b32_e32 v1, s17
; GCN-NEXT: v_mov_b32_e32 v2, s18
; GCN-NEXT: v_mov_b32_e32 v3, s19
-; GCN-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:48 sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:48 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:32 sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v0, s12
; GCN-NEXT: v_mov_b32_e32 v1, s13
; GCN-NEXT: v_mov_b32_e32 v2, s14
; GCN-NEXT: v_mov_b32_e32 v3, s15
-; GCN-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v0, s8
; GCN-NEXT: v_mov_b32_e32 v1, s9
; GCN-NEXT: v_mov_b32_e32 v2, s10
; GCN-NEXT: v_mov_b32_e32 v3, s11
-; GCN-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v12, a[8:11], s[0:1] offset:32 sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v12, a[12:15], s[0:1] offset:48 sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v12, a[0:3], s[0:1] sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v12, a[4:7], s[0:1] offset:16 sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 1, i32 2, i32 3)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
index 9bdae28f935c1..21465beb21de7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
@@ -394,9 +394,9 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; SDAG-NEXT: v_mov_b64_e32 v[12:13], 48
-; SDAG-NEXT: v_mov_b64_e32 v[14:15], 32
-; SDAG-NEXT: v_mov_b64_e32 v[16:17], 16
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], 48
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], 32
+; SDAG-NEXT: v_mov_b64_e32 v[12:13], 16
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
@@ -418,42 +418,42 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
; SDAG-NEXT: v_accvgpr_write_b32 a13, s21
; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
-; SDAG-NEXT: v_mov_b64_e32 v[18:19], 0
-; SDAG-NEXT: v_mov_b32_e32 v8, s16
+; SDAG-NEXT: v_mov_b64_e32 v[14:15], 0
+; SDAG-NEXT: v_mov_b32_e32 v16, s16
; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15]
; SDAG-NEXT: v_mov_b32_e32 v0, s20
; SDAG-NEXT: v_mov_b32_e32 v1, s21
; SDAG-NEXT: v_mov_b32_e32 v2, s22
; SDAG-NEXT: v_mov_b32_e32 v3, s23
-; SDAG-NEXT: v_mov_b32_e32 v9, s17
-; SDAG-NEXT: v_mov_b32_e32 v10, s18
-; SDAG-NEXT: v_mov_b32_e32 v11, s19
+; SDAG-NEXT: v_mov_b32_e32 v17, s17
+; SDAG-NEXT: v_mov_b32_e32 v18, s18
+; SDAG-NEXT: v_mov_b32_e32 v19, s19
; SDAG-NEXT: s_nop 4
-; SDAG-NEXT: global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[16:17], a[20:23], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[18:19], a[16:19], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[14:15], v[8:11], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mov_b32_e32 v0, s8
; SDAG-NEXT: v_mov_b32_e32 v1, s9
; SDAG-NEXT: v_mov_b32_e32 v2, s10
; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mov_b32_e32 v0, s12
; SDAG-NEXT: v_mov_b32_e32 v1, s13
; SDAG-NEXT: v_mov_b32_e32 v2, s14
; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -518,9 +518,9 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; HEURRC-NEXT: v_mov_b64_e32 v[12:13], 48
-; HEURRC-NEXT: v_mov_b64_e32 v[14:15], 32
-; HEURRC-NEXT: v_mov_b64_e32 v[16:17], 16
+; HEURRC-NEXT: v_mov_b64_e32 v[8:9], 48
+; HEURRC-NEXT: v_mov_b64_e32 v[10:11], 32
+; HEURRC-NEXT: v_mov_b64_e32 v[12:13], 16
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
@@ -542,42 +542,42 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21
; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22
; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23
-; HEURRC-NEXT: v_mov_b64_e32 v[18:19], 0
-; HEURRC-NEXT: v_mov_b32_e32 v8, s16
+; HEURRC-NEXT: v_mov_b64_e32 v[14:15], 0
+; HEURRC-NEXT: v_mov_b32_e32 v16, s16
; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15]
; HEURRC-NEXT: v_mov_b32_e32 v0, s20
; HEURRC-NEXT: v_mov_b32_e32 v1, s21
; HEURRC-NEXT: v_mov_b32_e32 v2, s22
; HEURRC-NEXT: v_mov_b32_e32 v3, s23
-; HEURRC-NEXT: v_mov_b32_e32 v9, s17
-; HEURRC-NEXT: v_mov_b32_e32 v10, s18
-; HEURRC-NEXT: v_mov_b32_e32 v11, s19
+; HEURRC-NEXT: v_mov_b32_e32 v17, s17
+; HEURRC-NEXT: v_mov_b32_e32 v18, s18
+; HEURRC-NEXT: v_mov_b32_e32 v19, s19
; HEURRC-NEXT: s_nop 4
-; HEURRC-NEXT: global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[16:17], a[20:23], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[18:19], a[16:19], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[14:15], v[8:11], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
; HEURRC-NEXT: v_mov_b32_e32 v0, s8
; HEURRC-NEXT: v_mov_b32_e32 v1, s9
; HEURRC-NEXT: v_mov_b32_e32 v2, s10
; HEURRC-NEXT: v_mov_b32_e32 v3, s11
-; HEURRC-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
; HEURRC-NEXT: v_mov_b32_e32 v0, s12
; HEURRC-NEXT: v_mov_b32_e32 v1, s13
; HEURRC-NEXT: v_mov_b32_e32 v2, s14
; HEURRC-NEXT: v_mov_b32_e32 v3, s15
-; HEURRC-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_endpgm
;
@@ -585,9 +585,9 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
; VGPRRC: ; %bb.0:
; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; VGPRRC-NEXT: v_mov_b64_e32 v[44:45], 48
-; VGPRRC-NEXT: v_mov_b64_e32 v[46:47], 32
-; VGPRRC-NEXT: v_mov_b64_e32 v[48:49], 16
+; VGPRRC-NEXT: v_mov_b64_e32 v[40:41], 48
+; VGPRRC-NEXT: v_mov_b64_e32 v[42:43], 32
+; VGPRRC-NEXT: v_mov_b64_e32 v[44:45], 16
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
; VGPRRC-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
; VGPRRC-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
@@ -601,43 +601,43 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; VGPRRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; VGPRRC-NEXT: v_mov_b64_e32 v[50:51], 0
-; VGPRRC-NEXT: v_mov_b32_e32 v40, s16
+; VGPRRC-NEXT: v_mov_b64_e32 v[46:47], 0
+; VGPRRC-NEXT: v_mov_b32_e32 v48, s16
; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[32:35], v[36:39], v[0:15]
-; VGPRRC-NEXT: v_mov_b32_e32 v41, s17
-; VGPRRC-NEXT: v_mov_b32_e32 v42, s18
-; VGPRRC-NEXT: v_mov_b32_e32 v43, s19
+; VGPRRC-NEXT: v_mov_b32_e32 v49, s17
+; VGPRRC-NEXT: v_mov_b32_e32 v50, s18
+; VGPRRC-NEXT: v_mov_b32_e32 v51, s19
; VGPRRC-NEXT: s_nop 7
; VGPRRC-NEXT: s_nop 0
-; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[28:31], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[28:31], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[24:27], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[24:27], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[48:49], v[20:23], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[20:23], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[50:51], v[16:19], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[16:19], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[40:43], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[48:51], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: v_mov_b32_e32 v0, s20
; VGPRRC-NEXT: v_mov_b32_e32 v1, s21
; VGPRRC-NEXT: v_mov_b32_e32 v2, s22
; VGPRRC-NEXT: v_mov_b32_e32 v3, s23
-; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v0, s8
; VGPRRC-NEXT: v_mov_b32_e32 v1, s9
; VGPRRC-NEXT: v_mov_b32_e32 v2, s10
; VGPRRC-NEXT: v_mov_b32_e32 v3, s11
-; VGPRRC-NEXT: global_store_dwordx4 v[50:51], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v0, s12
; VGPRRC-NEXT: v_mov_b32_e32 v1, s13
; VGPRRC-NEXT: v_mov_b32_e32 v2, s14
; VGPRRC-NEXT: v_mov_b32_e32 v3, s15
-; VGPRRC-NEXT: global_store_dwordx4 v[48:49], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_f32_32x32x16_f16:
@@ -776,9 +776,9 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; SDAG-NEXT: v_mov_b64_e32 v[12:13], 48
-; SDAG-NEXT: v_mov_b64_e32 v[14:15], 32
-; SDAG-NEXT: v_mov_b64_e32 v[16:17], 16
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], 48
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], 32
+; SDAG-NEXT: v_mov_b64_e32 v[12:13], 16
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
@@ -800,42 +800,42 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
; SDAG-NEXT: v_accvgpr_write_b32 a13, s21
; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
-; SDAG-NEXT: v_mov_b64_e32 v[18:19], 0
-; SDAG-NEXT: v_mov_b32_e32 v8, s16
+; SDAG-NEXT: v_mov_b64_e32 v[14:15], 0
+; SDAG-NEXT: v_mov_b32_e32 v16, s16
; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1
; SDAG-NEXT: v_mov_b32_e32 v0, s20
; SDAG-NEXT: v_mov_b32_e32 v1, s21
; SDAG-NEXT: v_mov_b32_e32 v2, s22
; SDAG-NEXT: v_mov_b32_e32 v3, s23
-; SDAG-NEXT: v_mov_b32_e32 v9, s17
-; SDAG-NEXT: v_mov_b32_e32 v10, s18
-; SDAG-NEXT: v_mov_b32_e32 v11, s19
+; SDAG-NEXT: v_mov_b32_e32 v17, s17
+; SDAG-NEXT: v_mov_b32_e32 v18, s18
+; SDAG-NEXT: v_mov_b32_e32 v19, s19
; SDAG-NEXT: s_nop 4
-; SDAG-NEXT: global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[16:17], a[20:23], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[18:19], a[16:19], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[14:15], v[8:11], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mov_b32_e32 v0, s8
; SDAG-NEXT: v_mov_b32_e32 v1, s9
; SDAG-NEXT: v_mov_b32_e32 v2, s10
; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mov_b32_e32 v0, s12
; SDAG-NEXT: v_mov_b32_e32 v1, s13
; SDAG-NEXT: v_mov_b32_e32 v2, s14
; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -900,9 +900,9 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; HEURRC-NEXT: v_mov_b64_e32 v[12:13], 48
-; HEURRC-NEXT: v_mov_b64_e32 v[14:15], 32
-; HEURRC-NEXT: v_mov_b64_e32 v[16:17], 16
+; HEURRC-NEXT: v_mov_b64_e32 v[8:9], 48
+; HEURRC-NEXT: v_mov_b64_e32 v[10:11], 32
+; HEURRC-NEXT: v_mov_b64_e32 v[12:13], 16
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
@@ -924,42 +924,42 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21
; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22
; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23
-; HEURRC-NEXT: v_mov_b64_e32 v[18:19], 0
-; HEURRC-NEXT: v_mov_b32_e32 v8, s16
+; HEURRC-NEXT: v_mov_b64_e32 v[14:15], 0
+; HEURRC-NEXT: v_mov_b32_e32 v16, s16
; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1
; HEURRC-NEXT: v_mov_b32_e32 v0, s20
; HEURRC-NEXT: v_mov_b32_e32 v1, s21
; HEURRC-NEXT: v_mov_b32_e32 v2, s22
; HEURRC-NEXT: v_mov_b32_e32 v3, s23
-; HEURRC-NEXT: v_mov_b32_e32 v9, s17
-; HEURRC-NEXT: v_mov_b32_e32 v10, s18
-; HEURRC-NEXT: v_mov_b32_e32 v11, s19
+; HEURRC-NEXT: v_mov_b32_e32 v17, s17
+; HEURRC-NEXT: v_mov_b32_e32 v18, s18
+; HEURRC-NEXT: v_mov_b32_e32 v19, s19
; HEURRC-NEXT: s_nop 4
-; HEURRC-NEXT: global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[16:17], a[20:23], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[18:19], a[16:19], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[14:15], v[8:11], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
; HEURRC-NEXT: v_mov_b32_e32 v0, s8
; HEURRC-NEXT: v_mov_b32_e32 v1, s9
; HEURRC-NEXT: v_mov_b32_e32 v2, s10
; HEURRC-NEXT: v_mov_b32_e32 v3, s11
-; HEURRC-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
; HEURRC-NEXT: v_mov_b32_e32 v0, s12
; HEURRC-NEXT: v_mov_b32_e32 v1, s13
; HEURRC-NEXT: v_mov_b32_e32 v2, s14
; HEURRC-NEXT: v_mov_b32_e32 v3, s15
-; HEURRC-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_endpgm
;
@@ -967,9 +967,9 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
; VGPRRC: ; %bb.0:
; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; VGPRRC-NEXT: v_mov_b64_e32 v[44:45], 48
-; VGPRRC-NEXT: v_mov_b64_e32 v[46:47], 32
-; VGPRRC-NEXT: v_mov_b64_e32 v[48:49], 16
+; VGPRRC-NEXT: v_mov_b64_e32 v[40:41], 48
+; VGPRRC-NEXT: v_mov_b64_e32 v[42:43], 32
+; VGPRRC-NEXT: v_mov_b64_e32 v[44:45], 16
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
; VGPRRC-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
; VGPRRC-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
@@ -983,43 +983,43 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; VGPRRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; VGPRRC-NEXT: v_mov_b64_e32 v[50:51], 0
-; VGPRRC-NEXT: v_mov_b32_e32 v40, s16
+; VGPRRC-NEXT: v_mov_b64_e32 v[46:47], 0
+; VGPRRC-NEXT: v_mov_b32_e32 v48, s16
; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[32:35], v[36:39], v[0:15] cbsz:2 abid:3 blgp:1
-; VGPRRC-NEXT: v_mov_b32_e32 v41, s17
-; VGPRRC-NEXT: v_mov_b32_e32 v42, s18
-; VGPRRC-NEXT: v_mov_b32_e32 v43, s19
+; VGPRRC-NEXT: v_mov_b32_e32 v49, s17
+; VGPRRC-NEXT: v_mov_b32_e32 v50, s18
+; VGPRRC-NEXT: v_mov_b32_e32 v51, s19
; VGPRRC-NEXT: s_nop 7
; VGPRRC-NEXT: s_nop 0
-; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[28:31], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[28:31], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[24:27], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[24:27], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[48:49], v[20:23], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[20:23], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[50:51], v[16:19], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[16:19], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[40:43], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[48:51], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: v_mov_b32_e32 v0, s20
; VGPRRC-NEXT: v_mov_b32_e32 v1, s21
; VGPRRC-NEXT: v_mov_b32_e32 v2, s22
; VGPRRC-NEXT: v_mov_b32_e32 v3, s23
-; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v0, s8
; VGPRRC-NEXT: v_mov_b32_e32 v1, s9
; VGPRRC-NEXT: v_mov_b32_e32 v2, s10
; VGPRRC-NEXT: v_mov_b32_e32 v3, s11
-; VGPRRC-NEXT: global_store_dwordx4 v[50:51], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v0, s12
; VGPRRC-NEXT: v_mov_b32_e32 v1, s13
; VGPRRC-NEXT: v_mov_b32_e32 v2, s14
; VGPRRC-NEXT: v_mov_b32_e32 v3, s15
-; VGPRRC-NEXT: global_store_dwordx4 v[48:49], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_f32_32x32x16_f16__flags:
@@ -1505,7 +1505,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0,
; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; SDAG-NEXT: v_mov_b32_e32 v12, 0
+; SDAG-NEXT: v_mov_b32_e32 v8, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
@@ -1527,40 +1527,40 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0,
; SDAG-NEXT: v_accvgpr_write_b32 a18, s10
; SDAG-NEXT: v_accvgpr_write_b32 a17, s9
; SDAG-NEXT: v_accvgpr_write_b32 a16, s8
-; SDAG-NEXT: v_mov_b32_e32 v8, s20
-; SDAG-NEXT: v_mov_b32_e32 v9, s21
+; SDAG-NEXT: v_mov_b32_e32 v10, s20
+; SDAG-NEXT: v_mov_b32_e32 v11, s21
; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[16:31]
-; SDAG-NEXT: v_mov_b32_e32 v10, s22
-; SDAG-NEXT: v_mov_b32_e32 v11, s23
+; SDAG-NEXT: v_mov_b32_e32 v12, s22
+; SDAG-NEXT: v_mov_b32_e32 v13, s23
; SDAG-NEXT: v_mov_b32_e32 v0, s16
; SDAG-NEXT: v_mov_b32_e32 v1, s17
; SDAG-NEXT: v_mov_b32_e32 v2, s18
; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:48 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mov_b32_e32 v0, s12
; SDAG-NEXT: v_mov_b32_e32 v1, s13
; SDAG-NEXT: v_mov_b32_e32 v2, s14
; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mov_b32_e32 v0, s8
; SDAG-NEXT: v_mov_b32_e32 v1, s9
; SDAG-NEXT: v_mov_b32_e32 v2, s10
; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v12, a[8:11], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v12, a[12:15], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v12, a[0:3], s[0:1] sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v12, a[4:7], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -1623,7 +1623,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0,
; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; HEURRC-NEXT: v_mov_b32_e32 v12, 0
+; HEURRC-NEXT: v_mov_b32_e32 v8, 0
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
@@ -1645,40 +1645,40 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0,
; HEURRC-NEXT: v_accvgpr_write_b32 a18, s10
; HEURRC-NEXT: v_accvgpr_write_b32 a17, s9
; HEURRC-NEXT: v_accvgpr_write_b32 a16, s8
-; HEURRC-NEXT: v_mov_b32_e32 v8, s20
-; HEURRC-NEXT: v_mov_b32_e32 v9, s21
+; HEURRC-NEXT: v_mov_b32_e32 v10, s20
+; HEURRC-NEXT: v_mov_b32_e32 v11, s21
; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[16:31]
-; HEURRC-NEXT: v_mov_b32_e32 v10, s22
-; HEURRC-NEXT: v_mov_b32_e32 v11, s23
+; HEURRC-NEXT: v_mov_b32_e32 v12, s22
+; HEURRC-NEXT: v_mov_b32_e32 v13, s23
; HEURRC-NEXT: v_mov_b32_e32 v0, s16
; HEURRC-NEXT: v_mov_b32_e32 v1, s17
; HEURRC-NEXT: v_mov_b32_e32 v2, s18
; HEURRC-NEXT: v_mov_b32_e32 v3, s19
-; HEURRC-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:48 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
; HEURRC-NEXT: v_mov_b32_e32 v0, s12
; HEURRC-NEXT: v_mov_b32_e32 v1, s13
; HEURRC-NEXT: v_mov_b32_e32 v2, s14
; HEURRC-NEXT: v_mov_b32_e32 v3, s15
-; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
; HEURRC-NEXT: v_mov_b32_e32 v0, s8
; HEURRC-NEXT: v_mov_b32_e32 v1, s9
; HEURRC-NEXT: v_mov_b32_e32 v2, s10
; HEURRC-NEXT: v_mov_b32_e32 v3, s11
-; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v12, a[8:11], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v12, a[12:15], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v12, a[0:3], s[0:1] sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v12, a[4:7], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_endpgm
;
@@ -1687,7 +1687,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0,
; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; VGPRRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; VGPRRC-NEXT: v_mov_b32_e32 v44, 0
+; VGPRRC-NEXT: v_mov_b32_e32 v40, 0
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
; VGPRRC-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
; VGPRRC-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
@@ -1701,41 +1701,41 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0,
; VGPRRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
; VGPRRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
-; VGPRRC-NEXT: v_mov_b32_e32 v40, s20
-; VGPRRC-NEXT: v_mov_b32_e32 v41, s21
+; VGPRRC-NEXT: v_mov_b32_e32 v42, s20
+; VGPRRC-NEXT: v_mov_b32_e32 v43, s21
; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31]
-; VGPRRC-NEXT: v_mov_b32_e32 v42, s22
-; VGPRRC-NEXT: v_mov_b32_e32 v43, s23
-; VGPRRC-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
+; VGPRRC-NEXT: v_mov_b32_e32 v44, s22
+; VGPRRC-NEXT: v_mov_b32_e32 v45, s23
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[42:45], s[0:1] offset:48 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 2
; VGPRRC-NEXT: v_mov_b32_e32 v16, s16
; VGPRRC-NEXT: v_mov_b32_e32 v17, s17
; VGPRRC-NEXT: v_mov_b32_e32 v18, s18
; VGPRRC-NEXT: v_mov_b32_e32 v19, s19
-; VGPRRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v16, s12
; VGPRRC-NEXT: v_mov_b32_e32 v17, s13
; VGPRRC-NEXT: v_mov_b32_e32 v18, s14
; VGPRRC-NEXT: v_mov_b32_e32 v19, s15
-; VGPRRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v16, s8
; VGPRRC-NEXT: v_mov_b32_e32 v17, s9
; VGPRRC-NEXT: v_mov_b32_e32 v18, s10
; VGPRRC-NEXT: v_mov_b32_e32 v19, s11
-; VGPRRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_f32_32x32x16_f16__vgprcd:
@@ -1869,7 +1869,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half>
; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; SDAG-NEXT: v_mov_b32_e32 v12, 0
+; SDAG-NEXT: v_mov_b32_e32 v8, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
@@ -1891,40 +1891,40 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half>
; SDAG-NEXT: v_accvgpr_write_b32 a18, s10
; SDAG-NEXT: v_accvgpr_write_b32 a17, s9
; SDAG-NEXT: v_accvgpr_write_b32 a16, s8
-; SDAG-NEXT: v_mov_b32_e32 v8, s20
-; SDAG-NEXT: v_mov_b32_e32 v9, s21
+; SDAG-NEXT: v_mov_b32_e32 v10, s20
+; SDAG-NEXT: v_mov_b32_e32 v11, s21
; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[16:31] cbsz:1 abid:2 blgp:3
-; SDAG-NEXT: v_mov_b32_e32 v10, s22
-; SDAG-NEXT: v_mov_b32_e32 v11, s23
+; SDAG-NEXT: v_mov_b32_e32 v12, s22
+; SDAG-NEXT: v_mov_b32_e32 v13, s23
; SDAG-NEXT: v_mov_b32_e32 v0, s16
; SDAG-NEXT: v_mov_b32_e32 v1, s17
; SDAG-NEXT: v_mov_b32_e32 v2, s18
; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:48 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mov_b32_e32 v0, s12
; SDAG-NEXT: v_mov_b32_e32 v1, s13
; SDAG-NEXT: v_mov_b32_e32 v2, s14
; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mov_b32_e32 v0, s8
; SDAG-NEXT: v_mov_b32_e32 v1, s9
; SDAG-NEXT: v_mov_b32_e32 v2, s10
; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v12, a[8:11], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v12, a[12:15], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v12, a[0:3], s[0:1] sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v12, a[4:7], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -1987,7 +1987,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half>
; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; HEURRC-NEXT: v_mov_b32_e32 v12, 0
+; HEURRC-NEXT: v_mov_b32_e32 v8, 0
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
@@ -2009,40 +2009,40 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half>
; HEURRC-NEXT: v_accvgpr_write_b32 a18, s10
; HEURRC-NEXT: v_accvgpr_write_b32 a17, s9
; HEURRC-NEXT: v_accvgpr_write_b32 a16, s8
-; HEURRC-NEXT: v_mov_b32_e32 v8, s20
-; HEURRC-NEXT: v_mov_b32_e32 v9, s21
+; HEURRC-NEXT: v_mov_b32_e32 v10, s20
+; HEURRC-NEXT: v_mov_b32_e32 v11, s21
; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[16:31] cbsz:1 abid:2 blgp:3
-; HEURRC-NEXT: v_mov_b32_e32 v10, s22
-; HEURRC-NEXT: v_mov_b32_e32 v11, s23
+; HEURRC-NEXT: v_mov_b32_e32 v12, s22
+; HEURRC-NEXT: v_mov_b32_e32 v13, s23
; HEURRC-NEXT: v_mov_b32_e32 v0, s16
; HEURRC-NEXT: v_mov_b32_e32 v1, s17
; HEURRC-NEXT: v_mov_b32_e32 v2, s18
; HEURRC-NEXT: v_mov_b32_e32 v3, s19
-; HEURRC-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:48 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
; HEURRC-NEXT: v_mov_b32_e32 v0, s12
; HEURRC-NEXT: v_mov_b32_e32 v1, s13
; HEURRC-NEXT: v_mov_b32_e32 v2, s14
; HEURRC-NEXT: v_mov_b32_e32 v3, s15
-; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
; HEURRC-NEXT: v_mov_b32_e32 v0, s8
; HEURRC-NEXT: v_mov_b32_e32 v1, s9
; HEURRC-NEXT: v_mov_b32_e32 v2, s10
; HEURRC-NEXT: v_mov_b32_e32 v3, s11
-; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v12, a[8:11], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v12, a[12:15], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v12, a[0:3], s[0:1] sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v12, a[4:7], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_endpgm
;
@@ -2051,7 +2051,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half>
; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; VGPRRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; VGPRRC-NEXT: v_mov_b32_e32 v44, 0
+; VGPRRC-NEXT: v_mov_b32_e32 v40, 0
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
; VGPRRC-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
; VGPRRC-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
@@ -2065,41 +2065,41 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half>
; VGPRRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
; VGPRRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
-; VGPRRC-NEXT: v_mov_b32_e32 v40, s20
-; VGPRRC-NEXT: v_mov_b32_e32 v41, s21
+; VGPRRC-NEXT: v_mov_b32_e32 v42, s20
+; VGPRRC-NEXT: v_mov_b32_e32 v43, s21
; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
-; VGPRRC-NEXT: v_mov_b32_e32 v42, s22
-; VGPRRC-NEXT: v_mov_b32_e32 v43, s23
-; VGPRRC-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
+; VGPRRC-NEXT: v_mov_b32_e32 v44, s22
+; VGPRRC-NEXT: v_mov_b32_e32 v45, s23
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[42:45], s[0:1] offset:48 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 2
; VGPRRC-NEXT: v_mov_b32_e32 v16, s16
; VGPRRC-NEXT: v_mov_b32_e32 v17, s17
; VGPRRC-NEXT: v_mov_b32_e32 v18, s18
; VGPRRC-NEXT: v_mov_b32_e32 v19, s19
-; VGPRRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v16, s12
; VGPRRC-NEXT: v_mov_b32_e32 v17, s13
; VGPRRC-NEXT: v_mov_b32_e32 v18, s14
; VGPRRC-NEXT: v_mov_b32_e32 v19, s15
-; VGPRRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v16, s8
; VGPRRC-NEXT: v_mov_b32_e32 v17, s9
; VGPRRC-NEXT: v_mov_b32_e32 v18, s10
; VGPRRC-NEXT: v_mov_b32_e32 v19, s11
-; VGPRRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_f32_32x32x16_f16__vgprcd__flags:
@@ -2781,24 +2781,24 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; SDAG-NEXT: v_mov_b32_e32 v8, 0
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v0, s8
-; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: v_mov_b32_e32 v4, s12
-; SDAG-NEXT: v_mov_b32_e32 v5, s13
-; SDAG-NEXT: v_mov_b32_e32 v6, s14
-; SDAG-NEXT: v_mov_b32_e32 v7, s15
+; SDAG-NEXT: v_mov_b32_e32 v2, s8
+; SDAG-NEXT: v_mov_b32_e32 v3, s9
+; SDAG-NEXT: v_mov_b32_e32 v4, s10
+; SDAG-NEXT: v_mov_b32_e32 v5, s11
+; SDAG-NEXT: v_mov_b32_e32 v6, s12
+; SDAG-NEXT: v_mov_b32_e32 v7, s13
+; SDAG-NEXT: v_mov_b32_e32 v8, s14
+; SDAG-NEXT: v_mov_b32_e32 v9, s15
; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3]
+; SDAG-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[2:5], v[6:9], a[0:3]
; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd:
@@ -2827,24 +2827,24 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa
; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; HEURRC-NEXT: v_mov_b32_e32 v8, 0
+; HEURRC-NEXT: v_mov_b32_e32 v0, 0
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b32_e32 v0, s8
-; HEURRC-NEXT: v_mov_b32_e32 v1, s9
-; HEURRC-NEXT: v_mov_b32_e32 v2, s10
-; HEURRC-NEXT: v_mov_b32_e32 v3, s11
-; HEURRC-NEXT: v_mov_b32_e32 v4, s12
-; HEURRC-NEXT: v_mov_b32_e32 v5, s13
-; HEURRC-NEXT: v_mov_b32_e32 v6, s14
-; HEURRC-NEXT: v_mov_b32_e32 v7, s15
+; HEURRC-NEXT: v_mov_b32_e32 v2, s8
+; HEURRC-NEXT: v_mov_b32_e32 v3, s9
+; HEURRC-NEXT: v_mov_b32_e32 v4, s10
+; HEURRC-NEXT: v_mov_b32_e32 v5, s11
+; HEURRC-NEXT: v_mov_b32_e32 v6, s12
+; HEURRC-NEXT: v_mov_b32_e32 v7, s13
+; HEURRC-NEXT: v_mov_b32_e32 v8, s14
+; HEURRC-NEXT: v_mov_b32_e32 v9, s15
; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0
; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1
; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2
; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3]
+; HEURRC-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[2:5], v[6:9], a[0:3]
; HEURRC-NEXT: s_nop 7
-; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
; HEURRC-NEXT: s_endpgm
;
; VGPRRC-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd:
@@ -2852,24 +2852,24 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa
; VGPRRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; VGPRRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; VGPRRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; VGPRRC-NEXT: v_mov_b32_e32 v12, 0
+; VGPRRC-NEXT: v_mov_b32_e32 v4, 0
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
-; VGPRRC-NEXT: v_mov_b32_e32 v0, s8
-; VGPRRC-NEXT: v_mov_b32_e32 v1, s9
-; VGPRRC-NEXT: v_mov_b32_e32 v2, s10
-; VGPRRC-NEXT: v_mov_b32_e32 v3, s11
-; VGPRRC-NEXT: v_mov_b32_e32 v4, s12
-; VGPRRC-NEXT: v_mov_b32_e32 v5, s13
-; VGPRRC-NEXT: v_mov_b32_e32 v6, s14
-; VGPRRC-NEXT: v_mov_b32_e32 v7, s15
-; VGPRRC-NEXT: v_mov_b32_e32 v8, s0
-; VGPRRC-NEXT: v_mov_b32_e32 v9, s1
-; VGPRRC-NEXT: v_mov_b32_e32 v10, s2
-; VGPRRC-NEXT: v_mov_b32_e32 v11, s3
+; VGPRRC-NEXT: v_mov_b32_e32 v6, s8
+; VGPRRC-NEXT: v_mov_b32_e32 v7, s9
+; VGPRRC-NEXT: v_mov_b32_e32 v8, s10
+; VGPRRC-NEXT: v_mov_b32_e32 v9, s11
+; VGPRRC-NEXT: v_mov_b32_e32 v10, s12
+; VGPRRC-NEXT: v_mov_b32_e32 v11, s13
+; VGPRRC-NEXT: v_mov_b32_e32 v12, s14
+; VGPRRC-NEXT: v_mov_b32_e32 v13, s15
+; VGPRRC-NEXT: v_mov_b32_e32 v0, s0
+; VGPRRC-NEXT: v_mov_b32_e32 v1, s1
+; VGPRRC-NEXT: v_mov_b32_e32 v2, s2
+; VGPRRC-NEXT: v_mov_b32_e32 v3, s3
; VGPRRC-NEXT: s_nop 1
-; VGPRRC-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11]
+; VGPRRC-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[6:9], v[10:13], v[0:3]
; VGPRRC-NEXT: s_nop 7
-; VGPRRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
+; VGPRRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd:
; AGPR: ; %bb.0:
@@ -2930,24 +2930,24 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; SDAG-NEXT: v_mov_b32_e32 v8, 0
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v0, s8
-; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: v_mov_b32_e32 v4, s12
-; SDAG-NEXT: v_mov_b32_e32 v5, s13
-; SDAG-NEXT: v_mov_b32_e32 v6, s14
-; SDAG-NEXT: v_mov_b32_e32 v7, s15
+; SDAG-NEXT: v_mov_b32_e32 v2, s8
+; SDAG-NEXT: v_mov_b32_e32 v3, s9
+; SDAG-NEXT: v_mov_b32_e32 v4, s10
+; SDAG-NEXT: v_mov_b32_e32 v5, s11
+; SDAG-NEXT: v_mov_b32_e32 v6, s12
+; SDAG-NEXT: v_mov_b32_e32 v7, s13
+; SDAG-NEXT: v_mov_b32_e32 v8, s14
+; SDAG-NEXT: v_mov_b32_e32 v9, s15
; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
+; SDAG-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[2:5], v[6:9], a[0:3] cbsz:3 abid:2 blgp:1
; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags:
@@ -2976,24 +2976,24 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr
; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; HEURRC-NEXT: v_mov_b32_e32 v8, 0
+; HEURRC-NEXT: v_mov_b32_e32 v0, 0
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b32_e32 v0, s8
-; HEURRC-NEXT: v_mov_b32_e32 v1, s9
-; HEURRC-NEXT: v_mov_b32_e32 v2, s10
-; HEURRC-NEXT: v_mov_b32_e32 v3, s11
-; HEURRC-NEXT: v_mov_b32_e32 v4, s12
-; HEURRC-NEXT: v_mov_b32_e32 v5, s13
-; HEURRC-NEXT: v_mov_b32_e32 v6, s14
-; HEURRC-NEXT: v_mov_b32_e32 v7, s15
+; HEURRC-NEXT: v_mov_b32_e32 v2, s8
+; HEURRC-NEXT: v_mov_b32_e32 v3, s9
+; HEURRC-NEXT: v_mov_b32_e32 v4, s10
+; HEURRC-NEXT: v_mov_b32_e32 v5, s11
+; HEURRC-NEXT: v_mov_b32_e32 v6, s12
+; HEURRC-NEXT: v_mov_b32_e32 v7, s13
+; HEURRC-NEXT: v_mov_b32_e32 v8, s14
+; HEURRC-NEXT: v_mov_b32_e32 v9, s15
; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0
; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1
; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2
; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
+; HEURRC-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[2:5], v[6:9], a[0:3] cbsz:3 abid:2 blgp:1
; HEURRC-NEXT: s_nop 7
-; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
; HEURRC-NEXT: s_endpgm
;
; VGPRRC-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags:
@@ -3001,24 +3001,24 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr
; VGPRRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; VGPRRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; VGPRRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; VGPRRC-NEXT: v_mov_b32_e32 v12, 0
+; VGPRRC-NEXT: v_mov_b32_e32 v4, 0
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
-; VGPRRC-NEXT: v_mov_b32_e32 v0, s8
-; VGPRRC-NEXT: v_mov_b32_e32 v1, s9
-; VGPRRC-NEXT: v_mov_b32_e32 v2, s10
-; VGPRRC-NEXT: v_mov_b32_e32 v3, s11
-; VGPRRC-NEXT: v_mov_b32_e32 v4, s12
-; VGPRRC-NEXT: v_mov_b32_e32 v5, s13
-; VGPRRC-NEXT: v_mov_b32_e32 v6, s14
-; VGPRRC-NEXT: v_mov_b32_e32 v7, s15
-; VGPRRC-NEXT: v_mov_b32_e32 v8, s0
-; VGPRRC-NEXT: v_mov_b32_e32 v9, s1
-; VGPRRC-NEXT: v_mov_b32_e32 v10, s2
-; VGPRRC-NEXT: v_mov_b32_e32 v11, s3
+; VGPRRC-NEXT: v_mov_b32_e32 v6, s8
+; VGPRRC-NEXT: v_mov_b32_e32 v7, s9
+; VGPRRC-NEXT: v_mov_b32_e32 v8, s10
+; VGPRRC-NEXT: v_mov_b32_e32 v9, s11
+; VGPRRC-NEXT: v_mov_b32_e32 v10, s12
+; VGPRRC-NEXT: v_mov_b32_e32 v11, s13
+; VGPRRC-NEXT: v_mov_b32_e32 v12, s14
+; VGPRRC-NEXT: v_mov_b32_e32 v13, s15
+; VGPRRC-NEXT: v_mov_b32_e32 v0, s0
+; VGPRRC-NEXT: v_mov_b32_e32 v1, s1
+; VGPRRC-NEXT: v_mov_b32_e32 v2, s2
+; VGPRRC-NEXT: v_mov_b32_e32 v3, s3
; VGPRRC-NEXT: s_nop 1
-; VGPRRC-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
+; VGPRRC-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1
; VGPRRC-NEXT: s_nop 7
-; VGPRRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
+; VGPRRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags:
; AGPR: ; %bb.0:
@@ -3084,19 +3084,19 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; SDAG-NEXT: v_mov_b64_e32 v[8:9], 48
-; SDAG-NEXT: v_mov_b64_e32 v[10:11], 32
-; SDAG-NEXT: v_mov_b64_e32 v[12:13], 16
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], 32
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v0, s24
-; SDAG-NEXT: v_mov_b32_e32 v1, s25
-; SDAG-NEXT: v_mov_b32_e32 v2, s26
-; SDAG-NEXT: v_mov_b32_e32 v3, s27
+; SDAG-NEXT: v_mov_b32_e32 v8, s24
+; SDAG-NEXT: v_mov_b32_e32 v9, s25
+; SDAG-NEXT: v_mov_b32_e32 v10, s26
+; SDAG-NEXT: v_mov_b32_e32 v11, s27
; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
-; SDAG-NEXT: v_mov_b32_e32 v4, s28
-; SDAG-NEXT: v_mov_b32_e32 v5, s29
-; SDAG-NEXT: v_mov_b32_e32 v6, s30
-; SDAG-NEXT: v_mov_b32_e32 v7, s31
+; SDAG-NEXT: v_mov_b32_e32 v12, s28
+; SDAG-NEXT: v_mov_b32_e32 v13, s29
+; SDAG-NEXT: v_mov_b32_e32 v14, s30
+; SDAG-NEXT: v_mov_b32_e32 v15, s31
; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
@@ -3112,44 +3112,42 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
; SDAG-NEXT: v_accvgpr_write_b32 a13, s21
; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
-; SDAG-NEXT: v_mov_b64_e32 v[14:15], 0
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], 0
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15]
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
+; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[8:11], v[12:15], a[0:15]
+; SDAG-NEXT: v_mov_b32_e32 v8, s16
+; SDAG-NEXT: v_mov_b32_e32 v9, s17
+; SDAG-NEXT: v_mov_b32_e32 v10, s18
+; SDAG-NEXT: v_mov_b32_e32 v11, s19
; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[0:1], a[28:31], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[2:3], a[24:27], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[4:5], a[20:23], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[6:7], a[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[2:3], v[8:11], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s20
-; SDAG-NEXT: v_mov_b32_e32 v1, s21
-; SDAG-NEXT: v_mov_b32_e32 v2, s22
-; SDAG-NEXT: v_mov_b32_e32 v3, s23
-; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v2, s10
+; SDAG-NEXT: v_mov_b32_e32 v3, s11
+; SDAG-NEXT: v_mov_b32_e32 v8, s20
+; SDAG-NEXT: v_mov_b32_e32 v9, s21
+; SDAG-NEXT: v_mov_b32_e32 v10, s22
+; SDAG-NEXT: v_mov_b32_e32 v11, s23
+; SDAG-NEXT: global_store_dwordx4 v[0:1], v[8:11], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mov_b32_e32 v0, s8
; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mov_b32_e32 v0, s12
; SDAG-NEXT: v_mov_b32_e32 v1, s13
; SDAG-NEXT: v_mov_b32_e32 v2, s14
; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -3214,19 +3212,19 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; HEURRC-NEXT: v_mov_b64_e32 v[8:9], 48
-; HEURRC-NEXT: v_mov_b64_e32 v[10:11], 32
-; HEURRC-NEXT: v_mov_b64_e32 v[12:13], 16
+; HEURRC-NEXT: v_mov_b64_e32 v[0:1], 48
+; HEURRC-NEXT: v_mov_b64_e32 v[2:3], 32
+; HEURRC-NEXT: v_mov_b64_e32 v[4:5], 16
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b32_e32 v0, s24
-; HEURRC-NEXT: v_mov_b32_e32 v1, s25
-; HEURRC-NEXT: v_mov_b32_e32 v2, s26
-; HEURRC-NEXT: v_mov_b32_e32 v3, s27
+; HEURRC-NEXT: v_mov_b32_e32 v8, s24
+; HEURRC-NEXT: v_mov_b32_e32 v9, s25
+; HEURRC-NEXT: v_mov_b32_e32 v10, s26
+; HEURRC-NEXT: v_mov_b32_e32 v11, s27
; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8
-; HEURRC-NEXT: v_mov_b32_e32 v4, s28
-; HEURRC-NEXT: v_mov_b32_e32 v5, s29
-; HEURRC-NEXT: v_mov_b32_e32 v6, s30
-; HEURRC-NEXT: v_mov_b32_e32 v7, s31
+; HEURRC-NEXT: v_mov_b32_e32 v12, s28
+; HEURRC-NEXT: v_mov_b32_e32 v13, s29
+; HEURRC-NEXT: v_mov_b32_e32 v14, s30
+; HEURRC-NEXT: v_mov_b32_e32 v15, s31
; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9
; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10
; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11
@@ -3242,44 +3240,42 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21
; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22
; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23
-; HEURRC-NEXT: v_mov_b64_e32 v[14:15], 0
+; HEURRC-NEXT: v_mov_b64_e32 v[6:7], 0
; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15]
-; HEURRC-NEXT: v_mov_b32_e32 v0, s16
-; HEURRC-NEXT: v_mov_b32_e32 v1, s17
-; HEURRC-NEXT: v_mov_b32_e32 v2, s18
-; HEURRC-NEXT: v_mov_b32_e32 v3, s19
+; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[8:11], v[12:15], a[0:15]
+; HEURRC-NEXT: v_mov_b32_e32 v8, s16
+; HEURRC-NEXT: v_mov_b32_e32 v9, s17
+; HEURRC-NEXT: v_mov_b32_e32 v10, s18
+; HEURRC-NEXT: v_mov_b32_e32 v11, s19
; HEURRC-NEXT: s_nop 7
-; HEURRC-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[0:1], a[28:31], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[2:3], a[24:27], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[4:5], a[20:23], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[6:7], a[16:19], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[2:3], v[8:11], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mov_b32_e32 v0, s20
-; HEURRC-NEXT: v_mov_b32_e32 v1, s21
-; HEURRC-NEXT: v_mov_b32_e32 v2, s22
-; HEURRC-NEXT: v_mov_b32_e32 v3, s23
-; HEURRC-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; HEURRC-NEXT: v_mov_b32_e32 v2, s10
+; HEURRC-NEXT: v_mov_b32_e32 v3, s11
+; HEURRC-NEXT: v_mov_b32_e32 v8, s20
+; HEURRC-NEXT: v_mov_b32_e32 v9, s21
+; HEURRC-NEXT: v_mov_b32_e32 v10, s22
+; HEURRC-NEXT: v_mov_b32_e32 v11, s23
+; HEURRC-NEXT: global_store_dwordx4 v[0:1], v[8:11], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: s_nop 0
; HEURRC-NEXT: v_mov_b32_e32 v0, s8
; HEURRC-NEXT: v_mov_b32_e32 v1, s9
-; HEURRC-NEXT: v_mov_b32_e32 v2, s10
-; HEURRC-NEXT: v_mov_b32_e32 v3, s11
-; HEURRC-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
; HEURRC-NEXT: v_mov_b32_e32 v0, s12
; HEURRC-NEXT: v_mov_b32_e32 v1, s13
; HEURRC-NEXT: v_mov_b32_e32 v2, s14
; HEURRC-NEXT: v_mov_b32_e32 v3, s15
-; HEURRC-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_endpgm
;
@@ -3287,19 +3283,19 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
; VGPRRC: ; %bb.0:
; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; VGPRRC-NEXT: v_mov_b64_e32 v[40:41], 48
-; VGPRRC-NEXT: v_mov_b64_e32 v[42:43], 32
-; VGPRRC-NEXT: v_mov_b64_e32 v[44:45], 16
+; VGPRRC-NEXT: v_mov_b64_e32 v[32:33], 48
+; VGPRRC-NEXT: v_mov_b64_e32 v[34:35], 32
+; VGPRRC-NEXT: v_mov_b64_e32 v[36:37], 16
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
-; VGPRRC-NEXT: v_mov_b32_e32 v32, s24
-; VGPRRC-NEXT: v_mov_b32_e32 v33, s25
-; VGPRRC-NEXT: v_mov_b32_e32 v34, s26
-; VGPRRC-NEXT: v_mov_b32_e32 v35, s27
+; VGPRRC-NEXT: v_mov_b32_e32 v40, s24
+; VGPRRC-NEXT: v_mov_b32_e32 v41, s25
+; VGPRRC-NEXT: v_mov_b32_e32 v42, s26
+; VGPRRC-NEXT: v_mov_b32_e32 v43, s27
; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
-; VGPRRC-NEXT: v_mov_b32_e32 v36, s28
-; VGPRRC-NEXT: v_mov_b32_e32 v37, s29
-; VGPRRC-NEXT: v_mov_b32_e32 v38, s30
-; VGPRRC-NEXT: v_mov_b32_e32 v39, s31
+; VGPRRC-NEXT: v_mov_b32_e32 v44, s28
+; VGPRRC-NEXT: v_mov_b32_e32 v45, s29
+; VGPRRC-NEXT: v_mov_b32_e32 v46, s30
+; VGPRRC-NEXT: v_mov_b32_e32 v47, s31
; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
@@ -3307,45 +3303,45 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; VGPRRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; VGPRRC-NEXT: v_mov_b64_e32 v[46:47], 0
+; VGPRRC-NEXT: v_mov_b64_e32 v[38:39], 0
; VGPRRC-NEXT: s_nop 0
-; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[16:31], v[32:35], v[36:39], v[0:15]
+; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[16:31], v[40:43], v[44:47], v[0:15]
; VGPRRC-NEXT: s_nop 7
; VGPRRC-NEXT: s_nop 3
-; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[28:31], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[32:33], v[28:31], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[24:27], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[34:35], v[24:27], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[20:23], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[36:37], v[20:23], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[16:19], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[38:39], v[16:19], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: v_mov_b32_e32 v0, s16
; VGPRRC-NEXT: v_mov_b32_e32 v1, s17
; VGPRRC-NEXT: v_mov_b32_e32 v2, s18
; VGPRRC-NEXT: v_mov_b32_e32 v3, s19
-; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[34:35], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v0, s20
; VGPRRC-NEXT: v_mov_b32_e32 v1, s21
; VGPRRC-NEXT: v_mov_b32_e32 v2, s22
; VGPRRC-NEXT: v_mov_b32_e32 v3, s23
-; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v0, s8
; VGPRRC-NEXT: v_mov_b32_e32 v1, s9
; VGPRRC-NEXT: v_mov_b32_e32 v2, s10
; VGPRRC-NEXT: v_mov_b32_e32 v3, s11
-; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[38:39], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v0, s12
; VGPRRC-NEXT: v_mov_b32_e32 v1, s13
; VGPRRC-NEXT: v_mov_b32_e32 v2, s14
; VGPRRC-NEXT: v_mov_b32_e32 v3, s15
-; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[36:37], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_i32_32x32x32_i8:
@@ -3496,19 +3492,19 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; SDAG-NEXT: v_mov_b64_e32 v[8:9], 48
-; SDAG-NEXT: v_mov_b64_e32 v[10:11], 32
-; SDAG-NEXT: v_mov_b64_e32 v[12:13], 16
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], 32
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v0, s24
-; SDAG-NEXT: v_mov_b32_e32 v1, s25
-; SDAG-NEXT: v_mov_b32_e32 v2, s26
-; SDAG-NEXT: v_mov_b32_e32 v3, s27
+; SDAG-NEXT: v_mov_b32_e32 v8, s24
+; SDAG-NEXT: v_mov_b32_e32 v9, s25
+; SDAG-NEXT: v_mov_b32_e32 v10, s26
+; SDAG-NEXT: v_mov_b32_e32 v11, s27
; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
-; SDAG-NEXT: v_mov_b32_e32 v4, s28
-; SDAG-NEXT: v_mov_b32_e32 v5, s29
-; SDAG-NEXT: v_mov_b32_e32 v6, s30
-; SDAG-NEXT: v_mov_b32_e32 v7, s31
+; SDAG-NEXT: v_mov_b32_e32 v12, s28
+; SDAG-NEXT: v_mov_b32_e32 v13, s29
+; SDAG-NEXT: v_mov_b32_e32 v14, s30
+; SDAG-NEXT: v_mov_b32_e32 v15, s31
; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
@@ -3524,44 +3520,42 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
; SDAG-NEXT: v_accvgpr_write_b32 a13, s21
; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
-; SDAG-NEXT: v_mov_b64_e32 v[14:15], 0
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], 0
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
+; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[8:11], v[12:15], a[0:15] cbsz:2 abid:3 blgp:1
+; SDAG-NEXT: v_mov_b32_e32 v8, s16
+; SDAG-NEXT: v_mov_b32_e32 v9, s17
+; SDAG-NEXT: v_mov_b32_e32 v10, s18
+; SDAG-NEXT: v_mov_b32_e32 v11, s19
; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[0:1], a[28:31], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[2:3], a[24:27], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[4:5], a[20:23], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[6:7], a[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[2:3], v[8:11], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s20
-; SDAG-NEXT: v_mov_b32_e32 v1, s21
-; SDAG-NEXT: v_mov_b32_e32 v2, s22
-; SDAG-NEXT: v_mov_b32_e32 v3, s23
-; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v2, s10
+; SDAG-NEXT: v_mov_b32_e32 v3, s11
+; SDAG-NEXT: v_mov_b32_e32 v8, s20
+; SDAG-NEXT: v_mov_b32_e32 v9, s21
+; SDAG-NEXT: v_mov_b32_e32 v10, s22
+; SDAG-NEXT: v_mov_b32_e32 v11, s23
+; SDAG-NEXT: global_store_dwordx4 v[0:1], v[8:11], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mov_b32_e32 v0, s8
; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mov_b32_e32 v0, s12
; SDAG-NEXT: v_mov_b32_e32 v1, s13
; SDAG-NEXT: v_mov_b32_e32 v2, s14
; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -3626,19 +3620,19 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; HEURRC-NEXT: v_mov_b64_e32 v[8:9], 48
-; HEURRC-NEXT: v_mov_b64_e32 v[10:11], 32
-; HEURRC-NEXT: v_mov_b64_e32 v[12:13], 16
+; HEURRC-NEXT: v_mov_b64_e32 v[0:1], 48
+; HEURRC-NEXT: v_mov_b64_e32 v[2:3], 32
+; HEURRC-NEXT: v_mov_b64_e32 v[4:5], 16
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b32_e32 v0, s24
-; HEURRC-NEXT: v_mov_b32_e32 v1, s25
-; HEURRC-NEXT: v_mov_b32_e32 v2, s26
-; HEURRC-NEXT: v_mov_b32_e32 v3, s27
+; HEURRC-NEXT: v_mov_b32_e32 v8, s24
+; HEURRC-NEXT: v_mov_b32_e32 v9, s25
+; HEURRC-NEXT: v_mov_b32_e32 v10, s26
+; HEURRC-NEXT: v_mov_b32_e32 v11, s27
; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8
-; HEURRC-NEXT: v_mov_b32_e32 v4, s28
-; HEURRC-NEXT: v_mov_b32_e32 v5, s29
-; HEURRC-NEXT: v_mov_b32_e32 v6, s30
-; HEURRC-NEXT: v_mov_b32_e32 v7, s31
+; HEURRC-NEXT: v_mov_b32_e32 v12, s28
+; HEURRC-NEXT: v_mov_b32_e32 v13, s29
+; HEURRC-NEXT: v_mov_b32_e32 v14, s30
+; HEURRC-NEXT: v_mov_b32_e32 v15, s31
; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9
; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10
; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11
@@ -3654,44 +3648,42 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21
; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22
; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23
-; HEURRC-NEXT: v_mov_b64_e32 v[14:15], 0
+; HEURRC-NEXT: v_mov_b64_e32 v[6:7], 0
; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1
-; HEURRC-NEXT: v_mov_b32_e32 v0, s16
-; HEURRC-NEXT: v_mov_b32_e32 v1, s17
-; HEURRC-NEXT: v_mov_b32_e32 v2, s18
-; HEURRC-NEXT: v_mov_b32_e32 v3, s19
+; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[8:11], v[12:15], a[0:15] cbsz:2 abid:3 blgp:1
+; HEURRC-NEXT: v_mov_b32_e32 v8, s16
+; HEURRC-NEXT: v_mov_b32_e32 v9, s17
+; HEURRC-NEXT: v_mov_b32_e32 v10, s18
+; HEURRC-NEXT: v_mov_b32_e32 v11, s19
; HEURRC-NEXT: s_nop 7
-; HEURRC-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[0:1], a[28:31], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[2:3], a[24:27], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[4:5], a[20:23], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[6:7], a[16:19], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[2:3], v[8:11], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mov_b32_e32 v0, s20
-; HEURRC-NEXT: v_mov_b32_e32 v1, s21
-; HEURRC-NEXT: v_mov_b32_e32 v2, s22
-; HEURRC-NEXT: v_mov_b32_e32 v3, s23
-; HEURRC-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; HEURRC-NEXT: v_mov_b32_e32 v2, s10
+; HEURRC-NEXT: v_mov_b32_e32 v3, s11
+; HEURRC-NEXT: v_mov_b32_e32 v8, s20
+; HEURRC-NEXT: v_mov_b32_e32 v9, s21
+; HEURRC-NEXT: v_mov_b32_e32 v10, s22
+; HEURRC-NEXT: v_mov_b32_e32 v11, s23
+; HEURRC-NEXT: global_store_dwordx4 v[0:1], v[8:11], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: s_nop 0
; HEURRC-NEXT: v_mov_b32_e32 v0, s8
; HEURRC-NEXT: v_mov_b32_e32 v1, s9
-; HEURRC-NEXT: v_mov_b32_e32 v2, s10
-; HEURRC-NEXT: v_mov_b32_e32 v3, s11
-; HEURRC-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
; HEURRC-NEXT: v_mov_b32_e32 v0, s12
; HEURRC-NEXT: v_mov_b32_e32 v1, s13
; HEURRC-NEXT: v_mov_b32_e32 v2, s14
; HEURRC-NEXT: v_mov_b32_e32 v3, s15
-; HEURRC-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_endpgm
;
@@ -3699,19 +3691,19 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
; VGPRRC: ; %bb.0:
; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; VGPRRC-NEXT: v_mov_b64_e32 v[40:41], 48
-; VGPRRC-NEXT: v_mov_b64_e32 v[42:43], 32
-; VGPRRC-NEXT: v_mov_b64_e32 v[44:45], 16
+; VGPRRC-NEXT: v_mov_b64_e32 v[32:33], 48
+; VGPRRC-NEXT: v_mov_b64_e32 v[34:35], 32
+; VGPRRC-NEXT: v_mov_b64_e32 v[36:37], 16
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
-; VGPRRC-NEXT: v_mov_b32_e32 v32, s24
-; VGPRRC-NEXT: v_mov_b32_e32 v33, s25
-; VGPRRC-NEXT: v_mov_b32_e32 v34, s26
-; VGPRRC-NEXT: v_mov_b32_e32 v35, s27
+; VGPRRC-NEXT: v_mov_b32_e32 v40, s24
+; VGPRRC-NEXT: v_mov_b32_e32 v41, s25
+; VGPRRC-NEXT: v_mov_b32_e32 v42, s26
+; VGPRRC-NEXT: v_mov_b32_e32 v43, s27
; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
-; VGPRRC-NEXT: v_mov_b32_e32 v36, s28
-; VGPRRC-NEXT: v_mov_b32_e32 v37, s29
-; VGPRRC-NEXT: v_mov_b32_e32 v38, s30
-; VGPRRC-NEXT: v_mov_b32_e32 v39, s31
+; VGPRRC-NEXT: v_mov_b32_e32 v44, s28
+; VGPRRC-NEXT: v_mov_b32_e32 v45, s29
+; VGPRRC-NEXT: v_mov_b32_e32 v46, s30
+; VGPRRC-NEXT: v_mov_b32_e32 v47, s31
; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
@@ -3719,45 +3711,45 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; VGPRRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; VGPRRC-NEXT: v_mov_b64_e32 v[46:47], 0
+; VGPRRC-NEXT: v_mov_b64_e32 v[38:39], 0
; VGPRRC-NEXT: s_nop 0
-; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[16:31], v[32:35], v[36:39], v[0:15] cbsz:2 abid:3 blgp:1
+; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[16:31], v[40:43], v[44:47], v[0:15] cbsz:2 abid:3 blgp:1
; VGPRRC-NEXT: s_nop 7
; VGPRRC-NEXT: s_nop 3
-; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[28:31], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[32:33], v[28:31], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[24:27], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[34:35], v[24:27], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[20:23], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[36:37], v[20:23], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[16:19], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[38:39], v[16:19], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: v_mov_b32_e32 v0, s16
; VGPRRC-NEXT: v_mov_b32_e32 v1, s17
; VGPRRC-NEXT: v_mov_b32_e32 v2, s18
; VGPRRC-NEXT: v_mov_b32_e32 v3, s19
-; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[34:35], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v0, s20
; VGPRRC-NEXT: v_mov_b32_e32 v1, s21
; VGPRRC-NEXT: v_mov_b32_e32 v2, s22
; VGPRRC-NEXT: v_mov_b32_e32 v3, s23
-; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v0, s8
; VGPRRC-NEXT: v_mov_b32_e32 v1, s9
; VGPRRC-NEXT: v_mov_b32_e32 v2, s10
; VGPRRC-NEXT: v_mov_b32_e32 v3, s11
-; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[38:39], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v0, s12
; VGPRRC-NEXT: v_mov_b32_e32 v1, s13
; VGPRRC-NEXT: v_mov_b32_e32 v2, s14
; VGPRRC-NEXT: v_mov_b32_e32 v3, s15
-; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[36:37], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_i32_32x32x32_i8__flags:
@@ -4254,17 +4246,17 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; SDAG-NEXT: v_mov_b32_e32 v8, 0
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v0, s20
-; SDAG-NEXT: v_mov_b32_e32 v1, s21
-; SDAG-NEXT: v_mov_b32_e32 v2, s22
-; SDAG-NEXT: v_mov_b32_e32 v3, s23
+; SDAG-NEXT: v_mov_b32_e32 v2, s20
+; SDAG-NEXT: v_mov_b32_e32 v3, s21
+; SDAG-NEXT: v_mov_b32_e32 v4, s22
+; SDAG-NEXT: v_mov_b32_e32 v5, s23
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; SDAG-NEXT: v_mov_b32_e32 v4, s24
-; SDAG-NEXT: v_mov_b32_e32 v5, s25
-; SDAG-NEXT: v_mov_b32_e32 v6, s26
-; SDAG-NEXT: v_mov_b32_e32 v7, s27
+; SDAG-NEXT: v_mov_b32_e32 v6, s24
+; SDAG-NEXT: v_mov_b32_e32 v7, s25
+; SDAG-NEXT: v_mov_b32_e32 v8, s26
+; SDAG-NEXT: v_mov_b32_e32 v9, s27
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_accvgpr_write_b32 a31, s23
; SDAG-NEXT: v_accvgpr_write_b32 a30, s22
@@ -4283,41 +4275,41 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4
; SDAG-NEXT: v_accvgpr_write_b32 a17, s9
; SDAG-NEXT: v_accvgpr_write_b32 a16, s8
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[16:31]
-; SDAG-NEXT: v_mov_b32_e32 v0, s20
-; SDAG-NEXT: v_mov_b32_e32 v1, s21
-; SDAG-NEXT: v_mov_b32_e32 v2, s22
-; SDAG-NEXT: v_mov_b32_e32 v3, s23
-; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[2:5], v[6:9], a[16:31]
+; SDAG-NEXT: v_mov_b32_e32 v2, s20
+; SDAG-NEXT: v_mov_b32_e32 v3, s21
+; SDAG-NEXT: v_mov_b32_e32 v4, s22
+; SDAG-NEXT: v_mov_b32_e32 v5, s23
+; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:48 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v2, s16
+; SDAG-NEXT: v_mov_b32_e32 v3, s17
+; SDAG-NEXT: v_mov_b32_e32 v4, s18
+; SDAG-NEXT: v_mov_b32_e32 v5, s19
+; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:32 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s12
-; SDAG-NEXT: v_mov_b32_e32 v1, s13
-; SDAG-NEXT: v_mov_b32_e32 v2, s14
-; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v2, s12
+; SDAG-NEXT: v_mov_b32_e32 v3, s13
+; SDAG-NEXT: v_mov_b32_e32 v4, s14
+; SDAG-NEXT: v_mov_b32_e32 v5, s15
+; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:16 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s8
-; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v2, s8
+; SDAG-NEXT: v_mov_b32_e32 v3, s9
+; SDAG-NEXT: v_mov_b32_e32 v4, s10
+; SDAG-NEXT: v_mov_b32_e32 v5, s11
+; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -4379,17 +4371,17 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; HEURRC-NEXT: v_mov_b32_e32 v8, 0
+; HEURRC-NEXT: v_mov_b32_e32 v0, 0
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b32_e32 v0, s20
-; HEURRC-NEXT: v_mov_b32_e32 v1, s21
-; HEURRC-NEXT: v_mov_b32_e32 v2, s22
-; HEURRC-NEXT: v_mov_b32_e32 v3, s23
+; HEURRC-NEXT: v_mov_b32_e32 v2, s20
+; HEURRC-NEXT: v_mov_b32_e32 v3, s21
+; HEURRC-NEXT: v_mov_b32_e32 v4, s22
+; HEURRC-NEXT: v_mov_b32_e32 v5, s23
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; HEURRC-NEXT: v_mov_b32_e32 v4, s24
-; HEURRC-NEXT: v_mov_b32_e32 v5, s25
-; HEURRC-NEXT: v_mov_b32_e32 v6, s26
-; HEURRC-NEXT: v_mov_b32_e32 v7, s27
+; HEURRC-NEXT: v_mov_b32_e32 v6, s24
+; HEURRC-NEXT: v_mov_b32_e32 v7, s25
+; HEURRC-NEXT: v_mov_b32_e32 v8, s26
+; HEURRC-NEXT: v_mov_b32_e32 v9, s27
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
; HEURRC-NEXT: v_accvgpr_write_b32 a31, s23
; HEURRC-NEXT: v_accvgpr_write_b32 a30, s22
@@ -4408,41 +4400,41 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4
; HEURRC-NEXT: v_accvgpr_write_b32 a17, s9
; HEURRC-NEXT: v_accvgpr_write_b32 a16, s8
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[16:31]
-; HEURRC-NEXT: v_mov_b32_e32 v0, s20
-; HEURRC-NEXT: v_mov_b32_e32 v1, s21
-; HEURRC-NEXT: v_mov_b32_e32 v2, s22
-; HEURRC-NEXT: v_mov_b32_e32 v3, s23
-; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[2:5], v[6:9], a[16:31]
+; HEURRC-NEXT: v_mov_b32_e32 v2, s20
+; HEURRC-NEXT: v_mov_b32_e32 v3, s21
+; HEURRC-NEXT: v_mov_b32_e32 v4, s22
+; HEURRC-NEXT: v_mov_b32_e32 v5, s23
+; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:48 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mov_b32_e32 v0, s16
-; HEURRC-NEXT: v_mov_b32_e32 v1, s17
-; HEURRC-NEXT: v_mov_b32_e32 v2, s18
-; HEURRC-NEXT: v_mov_b32_e32 v3, s19
-; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT: v_mov_b32_e32 v2, s16
+; HEURRC-NEXT: v_mov_b32_e32 v3, s17
+; HEURRC-NEXT: v_mov_b32_e32 v4, s18
+; HEURRC-NEXT: v_mov_b32_e32 v5, s19
+; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:32 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mov_b32_e32 v0, s12
-; HEURRC-NEXT: v_mov_b32_e32 v1, s13
-; HEURRC-NEXT: v_mov_b32_e32 v2, s14
-; HEURRC-NEXT: v_mov_b32_e32 v3, s15
-; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT: v_mov_b32_e32 v2, s12
+; HEURRC-NEXT: v_mov_b32_e32 v3, s13
+; HEURRC-NEXT: v_mov_b32_e32 v4, s14
+; HEURRC-NEXT: v_mov_b32_e32 v5, s15
+; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:16 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mov_b32_e32 v0, s8
-; HEURRC-NEXT: v_mov_b32_e32 v1, s9
-; HEURRC-NEXT: v_mov_b32_e32 v2, s10
-; HEURRC-NEXT: v_mov_b32_e32 v3, s11
-; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
+; HEURRC-NEXT: v_mov_b32_e32 v2, s8
+; HEURRC-NEXT: v_mov_b32_e32 v3, s9
+; HEURRC-NEXT: v_mov_b32_e32 v4, s10
+; HEURRC-NEXT: v_mov_b32_e32 v5, s11
+; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_endpgm
;
@@ -4450,17 +4442,17 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4
; VGPRRC: ; %bb.0:
; VGPRRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
; VGPRRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; VGPRRC-NEXT: v_mov_b32_e32 v40, 0
+; VGPRRC-NEXT: v_mov_b32_e32 v32, 0
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
-; VGPRRC-NEXT: v_mov_b32_e32 v32, s20
-; VGPRRC-NEXT: v_mov_b32_e32 v33, s21
-; VGPRRC-NEXT: v_mov_b32_e32 v34, s22
-; VGPRRC-NEXT: v_mov_b32_e32 v35, s23
+; VGPRRC-NEXT: v_mov_b32_e32 v34, s20
+; VGPRRC-NEXT: v_mov_b32_e32 v35, s21
+; VGPRRC-NEXT: v_mov_b32_e32 v36, s22
+; VGPRRC-NEXT: v_mov_b32_e32 v37, s23
; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; VGPRRC-NEXT: v_mov_b32_e32 v36, s24
-; VGPRRC-NEXT: v_mov_b32_e32 v37, s25
-; VGPRRC-NEXT: v_mov_b32_e32 v38, s26
-; VGPRRC-NEXT: v_mov_b32_e32 v39, s27
+; VGPRRC-NEXT: v_mov_b32_e32 v38, s24
+; VGPRRC-NEXT: v_mov_b32_e32 v39, s25
+; VGPRRC-NEXT: v_mov_b32_e32 v40, s26
+; VGPRRC-NEXT: v_mov_b32_e32 v41, s27
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
; VGPRRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
; VGPRRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
@@ -4471,42 +4463,42 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4
; VGPRRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
; VGPRRC-NEXT: s_nop 1
-; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31]
+; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[34:37], v[38:41], v[16:31]
; VGPRRC-NEXT: s_nop 6
; VGPRRC-NEXT: v_mov_b32_e32 v16, s20
; VGPRRC-NEXT: v_mov_b32_e32 v17, s21
; VGPRRC-NEXT: v_mov_b32_e32 v18, s22
; VGPRRC-NEXT: v_mov_b32_e32 v19, s23
-; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:48 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v16, s16
; VGPRRC-NEXT: v_mov_b32_e32 v17, s17
; VGPRRC-NEXT: v_mov_b32_e32 v18, s18
; VGPRRC-NEXT: v_mov_b32_e32 v19, s19
-; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v16, s12
; VGPRRC-NEXT: v_mov_b32_e32 v17, s13
; VGPRRC-NEXT: v_mov_b32_e32 v18, s14
; VGPRRC-NEXT: v_mov_b32_e32 v19, s15
-; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:16 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v16, s8
; VGPRRC-NEXT: v_mov_b32_e32 v17, s9
; VGPRRC-NEXT: v_mov_b32_e32 v18, s10
; VGPRRC-NEXT: v_mov_b32_e32 v19, s11
-; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_i32_32x32x32_i8__vgprcd:
@@ -4653,17 +4645,17 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; SDAG-NEXT: v_mov_b32_e32 v8, 0
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v0, s20
-; SDAG-NEXT: v_mov_b32_e32 v1, s21
-; SDAG-NEXT: v_mov_b32_e32 v2, s22
-; SDAG-NEXT: v_mov_b32_e32 v3, s23
+; SDAG-NEXT: v_mov_b32_e32 v2, s20
+; SDAG-NEXT: v_mov_b32_e32 v3, s21
+; SDAG-NEXT: v_mov_b32_e32 v4, s22
+; SDAG-NEXT: v_mov_b32_e32 v5, s23
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; SDAG-NEXT: v_mov_b32_e32 v4, s24
-; SDAG-NEXT: v_mov_b32_e32 v5, s25
-; SDAG-NEXT: v_mov_b32_e32 v6, s26
-; SDAG-NEXT: v_mov_b32_e32 v7, s27
+; SDAG-NEXT: v_mov_b32_e32 v6, s24
+; SDAG-NEXT: v_mov_b32_e32 v7, s25
+; SDAG-NEXT: v_mov_b32_e32 v8, s26
+; SDAG-NEXT: v_mov_b32_e32 v9, s27
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_accvgpr_write_b32 a31, s23
; SDAG-NEXT: v_accvgpr_write_b32 a30, s22
@@ -4682,41 +4674,41 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a
; SDAG-NEXT: v_accvgpr_write_b32 a17, s9
; SDAG-NEXT: v_accvgpr_write_b32 a16, s8
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[16:31] cbsz:1 abid:2 blgp:3
-; SDAG-NEXT: v_mov_b32_e32 v0, s20
-; SDAG-NEXT: v_mov_b32_e32 v1, s21
-; SDAG-NEXT: v_mov_b32_e32 v2, s22
-; SDAG-NEXT: v_mov_b32_e32 v3, s23
-; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[2:5], v[6:9], a[16:31] cbsz:1 abid:2 blgp:3
+; SDAG-NEXT: v_mov_b32_e32 v2, s20
+; SDAG-NEXT: v_mov_b32_e32 v3, s21
+; SDAG-NEXT: v_mov_b32_e32 v4, s22
+; SDAG-NEXT: v_mov_b32_e32 v5, s23
+; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:48 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v2, s16
+; SDAG-NEXT: v_mov_b32_e32 v3, s17
+; SDAG-NEXT: v_mov_b32_e32 v4, s18
+; SDAG-NEXT: v_mov_b32_e32 v5, s19
+; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:32 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s12
-; SDAG-NEXT: v_mov_b32_e32 v1, s13
-; SDAG-NEXT: v_mov_b32_e32 v2, s14
-; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v2, s12
+; SDAG-NEXT: v_mov_b32_e32 v3, s13
+; SDAG-NEXT: v_mov_b32_e32 v4, s14
+; SDAG-NEXT: v_mov_b32_e32 v5, s15
+; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:16 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s8
-; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v2, s8
+; SDAG-NEXT: v_mov_b32_e32 v3, s9
+; SDAG-NEXT: v_mov_b32_e32 v4, s10
+; SDAG-NEXT: v_mov_b32_e32 v5, s11
+; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -4778,17 +4770,17 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; HEURRC-NEXT: v_mov_b32_e32 v8, 0
+; HEURRC-NEXT: v_mov_b32_e32 v0, 0
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b32_e32 v0, s20
-; HEURRC-NEXT: v_mov_b32_e32 v1, s21
-; HEURRC-NEXT: v_mov_b32_e32 v2, s22
-; HEURRC-NEXT: v_mov_b32_e32 v3, s23
+; HEURRC-NEXT: v_mov_b32_e32 v2, s20
+; HEURRC-NEXT: v_mov_b32_e32 v3, s21
+; HEURRC-NEXT: v_mov_b32_e32 v4, s22
+; HEURRC-NEXT: v_mov_b32_e32 v5, s23
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; HEURRC-NEXT: v_mov_b32_e32 v4, s24
-; HEURRC-NEXT: v_mov_b32_e32 v5, s25
-; HEURRC-NEXT: v_mov_b32_e32 v6, s26
-; HEURRC-NEXT: v_mov_b32_e32 v7, s27
+; HEURRC-NEXT: v_mov_b32_e32 v6, s24
+; HEURRC-NEXT: v_mov_b32_e32 v7, s25
+; HEURRC-NEXT: v_mov_b32_e32 v8, s26
+; HEURRC-NEXT: v_mov_b32_e32 v9, s27
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
; HEURRC-NEXT: v_accvgpr_write_b32 a31, s23
; HEURRC-NEXT: v_accvgpr_write_b32 a30, s22
@@ -4807,41 +4799,41 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a
; HEURRC-NEXT: v_accvgpr_write_b32 a17, s9
; HEURRC-NEXT: v_accvgpr_write_b32 a16, s8
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[16:31] cbsz:1 abid:2 blgp:3
-; HEURRC-NEXT: v_mov_b32_e32 v0, s20
-; HEURRC-NEXT: v_mov_b32_e32 v1, s21
-; HEURRC-NEXT: v_mov_b32_e32 v2, s22
-; HEURRC-NEXT: v_mov_b32_e32 v3, s23
-; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[2:5], v[6:9], a[16:31] cbsz:1 abid:2 blgp:3
+; HEURRC-NEXT: v_mov_b32_e32 v2, s20
+; HEURRC-NEXT: v_mov_b32_e32 v3, s21
+; HEURRC-NEXT: v_mov_b32_e32 v4, s22
+; HEURRC-NEXT: v_mov_b32_e32 v5, s23
+; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:48 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mov_b32_e32 v0, s16
-; HEURRC-NEXT: v_mov_b32_e32 v1, s17
-; HEURRC-NEXT: v_mov_b32_e32 v2, s18
-; HEURRC-NEXT: v_mov_b32_e32 v3, s19
-; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT: v_mov_b32_e32 v2, s16
+; HEURRC-NEXT: v_mov_b32_e32 v3, s17
+; HEURRC-NEXT: v_mov_b32_e32 v4, s18
+; HEURRC-NEXT: v_mov_b32_e32 v5, s19
+; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:32 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mov_b32_e32 v0, s12
-; HEURRC-NEXT: v_mov_b32_e32 v1, s13
-; HEURRC-NEXT: v_mov_b32_e32 v2, s14
-; HEURRC-NEXT: v_mov_b32_e32 v3, s15
-; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT: v_mov_b32_e32 v2, s12
+; HEURRC-NEXT: v_mov_b32_e32 v3, s13
+; HEURRC-NEXT: v_mov_b32_e32 v4, s14
+; HEURRC-NEXT: v_mov_b32_e32 v5, s15
+; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:16 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mov_b32_e32 v0, s8
-; HEURRC-NEXT: v_mov_b32_e32 v1, s9
-; HEURRC-NEXT: v_mov_b32_e32 v2, s10
-; HEURRC-NEXT: v_mov_b32_e32 v3, s11
-; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
+; HEURRC-NEXT: v_mov_b32_e32 v2, s8
+; HEURRC-NEXT: v_mov_b32_e32 v3, s9
+; HEURRC-NEXT: v_mov_b32_e32 v4, s10
+; HEURRC-NEXT: v_mov_b32_e32 v5, s11
+; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_endpgm
;
@@ -4849,17 +4841,17 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a
; VGPRRC: ; %bb.0:
; VGPRRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
; VGPRRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; VGPRRC-NEXT: v_mov_b32_e32 v40, 0
+; VGPRRC-NEXT: v_mov_b32_e32 v32, 0
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
-; VGPRRC-NEXT: v_mov_b32_e32 v32, s20
-; VGPRRC-NEXT: v_mov_b32_e32 v33, s21
-; VGPRRC-NEXT: v_mov_b32_e32 v34, s22
-; VGPRRC-NEXT: v_mov_b32_e32 v35, s23
+; VGPRRC-NEXT: v_mov_b32_e32 v34, s20
+; VGPRRC-NEXT: v_mov_b32_e32 v35, s21
+; VGPRRC-NEXT: v_mov_b32_e32 v36, s22
+; VGPRRC-NEXT: v_mov_b32_e32 v37, s23
; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; VGPRRC-NEXT: v_mov_b32_e32 v36, s24
-; VGPRRC-NEXT: v_mov_b32_e32 v37, s25
-; VGPRRC-NEXT: v_mov_b32_e32 v38, s26
-; VGPRRC-NEXT: v_mov_b32_e32 v39, s27
+; VGPRRC-NEXT: v_mov_b32_e32 v38, s24
+; VGPRRC-NEXT: v_mov_b32_e32 v39, s25
+; VGPRRC-NEXT: v_mov_b32_e32 v40, s26
+; VGPRRC-NEXT: v_mov_b32_e32 v41, s27
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
; VGPRRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
; VGPRRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
@@ -4870,42 +4862,42 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a
; VGPRRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
; VGPRRC-NEXT: s_nop 1
-; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
+; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[34:37], v[38:41], v[16:31] cbsz:1 abid:2 blgp:3
; VGPRRC-NEXT: s_nop 6
; VGPRRC-NEXT: v_mov_b32_e32 v16, s20
; VGPRRC-NEXT: v_mov_b32_e32 v17, s21
; VGPRRC-NEXT: v_mov_b32_e32 v18, s22
; VGPRRC-NEXT: v_mov_b32_e32 v19, s23
-; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:48 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v16, s16
; VGPRRC-NEXT: v_mov_b32_e32 v17, s17
; VGPRRC-NEXT: v_mov_b32_e32 v18, s18
; VGPRRC-NEXT: v_mov_b32_e32 v19, s19
-; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v16, s12
; VGPRRC-NEXT: v_mov_b32_e32 v17, s13
; VGPRRC-NEXT: v_mov_b32_e32 v18, s14
; VGPRRC-NEXT: v_mov_b32_e32 v19, s15
-; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:16 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v16, s8
; VGPRRC-NEXT: v_mov_b32_e32 v17, s9
; VGPRRC-NEXT: v_mov_b32_e32 v18, s10
; VGPRRC-NEXT: v_mov_b32_e32 v19, s11
-; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_i32_32x32x32_i8__vgprcd__flags:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
index ff305da49852c..a959ead39c4ac 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
@@ -1114,19 +1114,19 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a27, v2
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a28, v0
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a29, v1
-; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s14
-; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s15
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s14
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s15
; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
-; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s0
-; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s1
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s0
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s1
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v5
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v6
-; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a30, v2
-; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a31, v3
-; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s2
-; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s3
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a30, v0
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a31, v1
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s2
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s3
; NOLIT-SRCC-NEXT: s_nop 1
-; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x4f16 a[0:31], v[0:1], v[2:3], a[0:31] cbsz:1 abid:2 blgp:3
+; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x4f16 a[0:31], v[2:3], v[0:1], a[0:31] cbsz:1 abid:2 blgp:3
; NOLIT-SRCC-NEXT: s_nop 7
; NOLIT-SRCC-NEXT: s_nop 7
; NOLIT-SRCC-NEXT: s_nop 1
@@ -1254,19 +1254,19 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a27, v2
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a28, v0
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a29, v1
-; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s14
-; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s15
+; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s14
+; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s15
; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
-; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s0
-; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s1
+; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s0
+; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s1
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v5
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v6
-; LIT-SRCC-NEXT: v_accvgpr_write_b32 a30, v2
-; LIT-SRCC-NEXT: v_accvgpr_write_b32 a31, v3
-; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s2
-; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s3
+; LIT-SRCC-NEXT: v_accvgpr_write_b32 a30, v0
+; LIT-SRCC-NEXT: v_accvgpr_write_b32 a31, v1
+; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s2
+; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s3
; LIT-SRCC-NEXT: s_nop 1
-; LIT-SRCC-NEXT: v_mfma_f32_32x32x4f16 a[0:31], v[0:1], v[2:3], a[0:31] cbsz:1 abid:2 blgp:3
+; LIT-SRCC-NEXT: v_mfma_f32_32x32x4f16 a[0:31], v[2:3], v[0:1], a[0:31] cbsz:1 abid:2 blgp:3
; LIT-SRCC-NEXT: s_nop 7
; LIT-SRCC-NEXT: s_nop 7
; LIT-SRCC-NEXT: s_nop 1
@@ -1330,7 +1330,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a
; GFX90A-LABEL: test_mfma_f32_32x32x4f16:
; GFX90A: ; %bb.0: ; %bb
; GFX90A-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x24
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[36:37], 0x40
; GFX90A-NEXT: s_load_dwordx16 s[16:31], s[36:37], 0x0
@@ -1345,8 +1345,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a
; GFX90A-NEXT: v_accvgpr_write_b32 a2, s18
; GFX90A-NEXT: v_accvgpr_write_b32 a3, s19
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, s0
-; GFX90A-NEXT: v_mov_b32_e32 v1, s1
+; GFX90A-NEXT: v_mov_b32_e32 v2, s0
+; GFX90A-NEXT: v_mov_b32_e32 v3, s1
; GFX90A-NEXT: v_accvgpr_write_b32 a4, s20
; GFX90A-NEXT: v_accvgpr_write_b32 a5, s21
; GFX90A-NEXT: v_accvgpr_write_b32 a6, s22
@@ -1371,27 +1371,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a
; GFX90A-NEXT: v_accvgpr_write_b32 a29, s13
; GFX90A-NEXT: v_accvgpr_write_b32 a30, s14
; GFX90A-NEXT: v_accvgpr_write_b32 a31, s15
-; GFX90A-NEXT: v_mov_b32_e32 v2, s2
-; GFX90A-NEXT: v_mov_b32_e32 v3, s3
+; GFX90A-NEXT: v_mov_b32_e32 v4, s2
+; GFX90A-NEXT: v_mov_b32_e32 v5, s3
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f32_32x32x4f16 a[0:31], v[0:1], v[2:3], a[0:31] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mfma_f32_32x32x4f16 a[0:31], v[2:3], v[4:5], a[0:31] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 2
-; GFX90A-NEXT: global_store_dwordx4 v4, a[24:27], s[36:37] offset:96
-; GFX90A-NEXT: global_store_dwordx4 v4, a[28:31], s[36:37] offset:112
-; GFX90A-NEXT: global_store_dwordx4 v4, a[16:19], s[36:37] offset:64
-; GFX90A-NEXT: global_store_dwordx4 v4, a[20:23], s[36:37] offset:80
-; GFX90A-NEXT: global_store_dwordx4 v4, a[8:11], s[36:37] offset:32
-; GFX90A-NEXT: global_store_dwordx4 v4, a[12:15], s[36:37] offset:48
-; GFX90A-NEXT: global_store_dwordx4 v4, a[0:3], s[36:37]
-; GFX90A-NEXT: global_store_dwordx4 v4, a[4:7], s[36:37] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[36:37] offset:96
+; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[36:37] offset:112
+; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[36:37] offset:64
+; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[36:37] offset:80
+; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[36:37] offset:32
+; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[36:37] offset:48
+; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[36:37]
+; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[36:37] offset:16
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_f32_32x32x4f16:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x24
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_load_dwordx16 s[0:15], s[36:37], 0x40
; GFX942-NEXT: s_load_dwordx16 s[16:31], s[36:37], 0x0
@@ -1406,8 +1406,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a
; GFX942-NEXT: v_accvgpr_write_b32 a2, s18
; GFX942-NEXT: v_accvgpr_write_b32 a3, s19
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v0, s0
-; GFX942-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NEXT: v_mov_b32_e32 v2, s0
+; GFX942-NEXT: v_mov_b32_e32 v3, s1
; GFX942-NEXT: v_accvgpr_write_b32 a4, s20
; GFX942-NEXT: v_accvgpr_write_b32 a5, s21
; GFX942-NEXT: v_accvgpr_write_b32 a6, s22
@@ -1432,21 +1432,21 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a
; GFX942-NEXT: v_accvgpr_write_b32 a29, s13
; GFX942-NEXT: v_accvgpr_write_b32 a30, s14
; GFX942-NEXT: v_accvgpr_write_b32 a31, s15
-; GFX942-NEXT: v_mov_b32_e32 v2, s2
-; GFX942-NEXT: v_mov_b32_e32 v3, s3
+; GFX942-NEXT: v_mov_b32_e32 v4, s2
+; GFX942-NEXT: v_mov_b32_e32 v5, s3
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_32x32x4_2b_f16 a[0:31], v[0:1], v[2:3], a[0:31] cbsz:1 abid:2 blgp:3
+; GFX942-NEXT: v_mfma_f32_32x32x4_2b_f16 a[0:31], v[2:3], v[4:5], a[0:31] cbsz:1 abid:2 blgp:3
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 2
-; GFX942-NEXT: global_store_dwordx4 v4, a[24:27], s[36:37] offset:96
-; GFX942-NEXT: global_store_dwordx4 v4, a[28:31], s[36:37] offset:112
-; GFX942-NEXT: global_store_dwordx4 v4, a[16:19], s[36:37] offset:64
-; GFX942-NEXT: global_store_dwordx4 v4, a[20:23], s[36:37] offset:80
-; GFX942-NEXT: global_store_dwordx4 v4, a[8:11], s[36:37] offset:32
-; GFX942-NEXT: global_store_dwordx4 v4, a[12:15], s[36:37] offset:48
-; GFX942-NEXT: global_store_dwordx4 v4, a[0:3], s[36:37]
-; GFX942-NEXT: global_store_dwordx4 v4, a[4:7], s[36:37] offset:16
+; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[36:37] offset:96
+; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[36:37] offset:112
+; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[36:37] offset:64
+; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[36:37] offset:80
+; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[36:37] offset:32
+; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[36:37] offset:48
+; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[36:37]
+; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[36:37] offset:16
; GFX942-NEXT: s_endpgm
bb:
%in.1 = load <32 x float>, ptr addrspace(1) %arg
@@ -1752,45 +1752,45 @@ define amdgpu_kernel void @test_mfma_f32_4x4x4f16(ptr addrspace(1) %arg, ptr add
; GFX90A-LABEL: test_mfma_f32_4x4x4f16:
; GFX90A: ; %bb.0: ; %bb
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, s4
-; GFX90A-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NEXT: v_mov_b32_e32 v3, s5
; GFX90A-NEXT: v_accvgpr_write_b32 a0, s8
-; GFX90A-NEXT: v_mov_b32_e32 v2, s6
-; GFX90A-NEXT: v_mov_b32_e32 v3, s7
+; GFX90A-NEXT: v_mov_b32_e32 v4, s6
+; GFX90A-NEXT: v_mov_b32_e32 v5, s7
; GFX90A-NEXT: v_accvgpr_write_b32 a1, s9
; GFX90A-NEXT: v_accvgpr_write_b32 a2, s10
; GFX90A-NEXT: v_accvgpr_write_b32 a3, s11
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f32_4x4x4f16 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mfma_f32_4x4x4f16 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: s_nop 4
-; GFX90A-NEXT: global_store_dwordx4 v4, a[0:3], s[0:1]
+; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_f32_4x4x4f16:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX942-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v0, s4
-; GFX942-NEXT: v_mov_b32_e32 v1, s5
+; GFX942-NEXT: v_mov_b32_e32 v2, s4
+; GFX942-NEXT: v_mov_b32_e32 v3, s5
; GFX942-NEXT: v_accvgpr_write_b32 a0, s8
-; GFX942-NEXT: v_mov_b32_e32 v2, s6
-; GFX942-NEXT: v_mov_b32_e32 v3, s7
+; GFX942-NEXT: v_mov_b32_e32 v4, s6
+; GFX942-NEXT: v_mov_b32_e32 v5, s7
; GFX942-NEXT: v_accvgpr_write_b32 a1, s9
; GFX942-NEXT: v_accvgpr_write_b32 a2, s10
; GFX942-NEXT: v_accvgpr_write_b32 a3, s11
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_4x4x4_16b_f16 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-NEXT: v_mfma_f32_4x4x4_16b_f16 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
; GFX942-NEXT: s_nop 4
-; GFX942-NEXT: global_store_dwordx4 v4, a[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
@@ -2099,46 +2099,46 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16f16(ptr addrspace(1) %arg, ptr
; GFX90A-LABEL: test_mfma_f32_16x16x16f16:
; GFX90A: ; %bb.0: ; %bb
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, s4
-; GFX90A-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NEXT: v_mov_b32_e32 v3, s5
; GFX90A-NEXT: v_accvgpr_write_b32 a0, s8
-; GFX90A-NEXT: v_mov_b32_e32 v2, s6
-; GFX90A-NEXT: v_mov_b32_e32 v3, s7
+; GFX90A-NEXT: v_mov_b32_e32 v4, s6
+; GFX90A-NEXT: v_mov_b32_e32 v5, s7
; GFX90A-NEXT: v_accvgpr_write_b32 a1, s9
; GFX90A-NEXT: v_accvgpr_write_b32 a2, s10
; GFX90A-NEXT: v_accvgpr_write_b32 a3, s11
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f32_16x16x16f16 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mfma_f32_16x16x16f16 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 2
-; GFX90A-NEXT: global_store_dwordx4 v4, a[0:3], s[0:1]
+; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_f32_16x16x16f16:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX942-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v0, s4
-; GFX942-NEXT: v_mov_b32_e32 v1, s5
+; GFX942-NEXT: v_mov_b32_e32 v2, s4
+; GFX942-NEXT: v_mov_b32_e32 v3, s5
; GFX942-NEXT: v_accvgpr_write_b32 a0, s8
-; GFX942-NEXT: v_mov_b32_e32 v2, s6
-; GFX942-NEXT: v_mov_b32_e32 v3, s7
+; GFX942-NEXT: v_mov_b32_e32 v4, s6
+; GFX942-NEXT: v_mov_b32_e32 v5, s7
; GFX942-NEXT: v_accvgpr_write_b32 a1, s9
; GFX942-NEXT: v_accvgpr_write_b32 a2, s10
; GFX942-NEXT: v_accvgpr_write_b32 a3, s11
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_16x16x16_f16 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-NEXT: v_mfma_f32_16x16x16_f16 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
; GFX942-NEXT: s_nop 6
-; GFX942-NEXT: global_store_dwordx4 v4, a[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
index 04ee0bbd17673..37809da10241b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
@@ -1485,30 +1485,30 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inr
; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v12, s0
-; SDAG-NEXT: v_mov_b32_e32 v13, s1
-; SDAG-NEXT: v_mov_b32_e32 v14, s2
-; SDAG-NEXT: v_mov_b32_e32 v15, s3
-; SDAG-NEXT: v_mov_b32_e32 v16, s16
-; SDAG-NEXT: v_mov_b32_e32 v17, s17
-; SDAG-NEXT: v_mov_b32_e32 v18, s18
-; SDAG-NEXT: v_mov_b32_e32 v19, s19
-; SDAG-NEXT: v_mov_b32_e32 v20, s28
-; SDAG-NEXT: v_mov_b32_e32 v21, s29
-; SDAG-NEXT: v_mov_b32_e32 v4, s20
-; SDAG-NEXT: v_mov_b32_e32 v5, s21
-; SDAG-NEXT: v_mov_b32_e32 v6, s22
-; SDAG-NEXT: v_mov_b32_e32 v7, s23
-; SDAG-NEXT: v_mov_b32_e32 v8, s24
-; SDAG-NEXT: v_mov_b32_e32 v9, s25
-; SDAG-NEXT: v_mov_b32_e32 v10, s26
-; SDAG-NEXT: v_mov_b32_e32 v11, s27
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v21
+; SDAG-NEXT: v_mov_b32_e32 v14, s0
+; SDAG-NEXT: v_mov_b32_e32 v15, s1
+; SDAG-NEXT: v_mov_b32_e32 v16, s2
+; SDAG-NEXT: v_mov_b32_e32 v17, s3
+; SDAG-NEXT: v_mov_b32_e32 v18, s16
+; SDAG-NEXT: v_mov_b32_e32 v19, s17
+; SDAG-NEXT: v_mov_b32_e32 v20, s18
+; SDAG-NEXT: v_mov_b32_e32 v21, s19
+; SDAG-NEXT: v_mov_b32_e32 v4, s28
+; SDAG-NEXT: v_mov_b32_e32 v5, s29
+; SDAG-NEXT: v_mov_b32_e32 v6, s20
+; SDAG-NEXT: v_mov_b32_e32 v7, s21
+; SDAG-NEXT: v_mov_b32_e32 v8, s22
+; SDAG-NEXT: v_mov_b32_e32 v9, s23
+; SDAG-NEXT: v_mov_b32_e32 v10, s24
+; SDAG-NEXT: v_mov_b32_e32 v11, s25
+; SDAG-NEXT: v_mov_b32_e32 v12, s26
+; SDAG-NEXT: v_mov_b32_e32 v13, s27
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v4
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v5
; SDAG-NEXT: v_accvgpr_write_b32 a2, v0
; SDAG-NEXT: v_accvgpr_write_b32 a3, v1
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[12:19], v[4:11], a[0:3], v2, v3 op_sel_hi:[0,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[6:13], a[0:3], v2, v3 op_sel_hi:[0,0,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
@@ -1895,36 +1895,36 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32
; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v0, s8
-; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: v_mov_b32_e32 v4, s12
-; SDAG-NEXT: v_mov_b32_e32 v5, s13
-; SDAG-NEXT: v_mov_b32_e32 v6, s14
-; SDAG-NEXT: v_mov_b32_e32 v7, s15
+; SDAG-NEXT: v_mov_b32_e32 v2, s8
+; SDAG-NEXT: v_mov_b32_e32 v3, s9
+; SDAG-NEXT: v_mov_b32_e32 v4, s10
+; SDAG-NEXT: v_mov_b32_e32 v5, s11
+; SDAG-NEXT: v_mov_b32_e32 v6, s12
+; SDAG-NEXT: v_mov_b32_e32 v7, s13
+; SDAG-NEXT: v_mov_b32_e32 v8, s14
+; SDAG-NEXT: v_mov_b32_e32 v9, s15
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40
-; SDAG-NEXT: v_mov_b32_e32 v8, s16
-; SDAG-NEXT: v_mov_b32_e32 v9, s17
-; SDAG-NEXT: v_mov_b32_e32 v10, s18
-; SDAG-NEXT: v_mov_b32_e32 v11, s19
-; SDAG-NEXT: v_mov_b32_e32 v12, s20
-; SDAG-NEXT: v_mov_b32_e32 v13, s21
-; SDAG-NEXT: v_mov_b32_e32 v14, s22
-; SDAG-NEXT: v_mov_b32_e32 v15, s23
+; SDAG-NEXT: v_mov_b32_e32 v10, s16
+; SDAG-NEXT: v_mov_b32_e32 v11, s17
+; SDAG-NEXT: v_mov_b32_e32 v12, s18
+; SDAG-NEXT: v_mov_b32_e32 v13, s19
+; SDAG-NEXT: v_mov_b32_e32 v14, s20
+; SDAG-NEXT: v_mov_b32_e32 v15, s21
+; SDAG-NEXT: v_mov_b32_e32 v16, s22
+; SDAG-NEXT: v_mov_b32_e32 v17, s23
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
-; SDAG-NEXT: v_mov_b32_e32 v17, s13
+; SDAG-NEXT: v_mov_b32_e32 v1, s13
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s12, v17 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[2:9], v[10:17], a[0:3], s12, v1 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[14:15]
+; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[14:15]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd:
@@ -1964,33 +1964,33 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
; SDAG-NEXT: s_movk_i32 s6, 0x41
; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v0, s8
-; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: v_mov_b32_e32 v4, s12
-; SDAG-NEXT: v_mov_b32_e32 v5, s13
-; SDAG-NEXT: v_mov_b32_e32 v6, s14
-; SDAG-NEXT: v_mov_b32_e32 v7, s15
+; SDAG-NEXT: v_mov_b32_e32 v2, s8
+; SDAG-NEXT: v_mov_b32_e32 v3, s9
+; SDAG-NEXT: v_mov_b32_e32 v4, s10
+; SDAG-NEXT: v_mov_b32_e32 v5, s11
+; SDAG-NEXT: v_mov_b32_e32 v6, s12
+; SDAG-NEXT: v_mov_b32_e32 v7, s13
+; SDAG-NEXT: v_mov_b32_e32 v8, s14
+; SDAG-NEXT: v_mov_b32_e32 v9, s15
; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; SDAG-NEXT: v_mov_b32_e32 v8, s16
-; SDAG-NEXT: v_mov_b32_e32 v9, s17
-; SDAG-NEXT: v_mov_b32_e32 v10, s18
-; SDAG-NEXT: v_mov_b32_e32 v11, s19
-; SDAG-NEXT: v_mov_b32_e32 v12, s20
-; SDAG-NEXT: v_mov_b32_e32 v13, s21
-; SDAG-NEXT: v_mov_b32_e32 v14, s22
-; SDAG-NEXT: v_mov_b32_e32 v15, s23
+; SDAG-NEXT: v_mov_b32_e32 v10, s16
+; SDAG-NEXT: v_mov_b32_e32 v11, s17
+; SDAG-NEXT: v_mov_b32_e32 v12, s18
+; SDAG-NEXT: v_mov_b32_e32 v13, s19
+; SDAG-NEXT: v_mov_b32_e32 v14, s20
+; SDAG-NEXT: v_mov_b32_e32 v15, s21
+; SDAG-NEXT: v_mov_b32_e32 v16, s22
+; SDAG-NEXT: v_mov_b32_e32 v17, s23
; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s6, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[2:9], v[10:17], a[0:3], s6, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[4:5]
+; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm:
@@ -2031,33 +2031,33 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
; SDAG-NEXT: s_movk_i32 s6, 0x41
; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v0, s8
-; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: v_mov_b32_e32 v4, s12
-; SDAG-NEXT: v_mov_b32_e32 v5, s13
-; SDAG-NEXT: v_mov_b32_e32 v6, s14
-; SDAG-NEXT: v_mov_b32_e32 v7, s15
+; SDAG-NEXT: v_mov_b32_e32 v2, s8
+; SDAG-NEXT: v_mov_b32_e32 v3, s9
+; SDAG-NEXT: v_mov_b32_e32 v4, s10
+; SDAG-NEXT: v_mov_b32_e32 v5, s11
+; SDAG-NEXT: v_mov_b32_e32 v6, s12
+; SDAG-NEXT: v_mov_b32_e32 v7, s13
+; SDAG-NEXT: v_mov_b32_e32 v8, s14
+; SDAG-NEXT: v_mov_b32_e32 v9, s15
; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; SDAG-NEXT: v_mov_b32_e32 v8, s16
-; SDAG-NEXT: v_mov_b32_e32 v9, s17
-; SDAG-NEXT: v_mov_b32_e32 v10, s18
-; SDAG-NEXT: v_mov_b32_e32 v11, s19
-; SDAG-NEXT: v_mov_b32_e32 v12, s20
-; SDAG-NEXT: v_mov_b32_e32 v13, s21
-; SDAG-NEXT: v_mov_b32_e32 v14, s22
-; SDAG-NEXT: v_mov_b32_e32 v15, s23
+; SDAG-NEXT: v_mov_b32_e32 v10, s16
+; SDAG-NEXT: v_mov_b32_e32 v11, s17
+; SDAG-NEXT: v_mov_b32_e32 v12, s18
+; SDAG-NEXT: v_mov_b32_e32 v13, s19
+; SDAG-NEXT: v_mov_b32_e32 v14, s20
+; SDAG-NEXT: v_mov_b32_e32 v15, s21
+; SDAG-NEXT: v_mov_b32_e32 v16, s22
+; SDAG-NEXT: v_mov_b32_e32 v17, s23
; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s6, 1.0 op_sel:[1,1,0] op_sel_hi:[1,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[2:9], v[10:17], a[0:3], s6, 1.0 op_sel:[1,1,0] op_sel_hi:[1,0,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[4:5]
+; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__FP_literal:
@@ -2096,34 +2096,34 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v0, s8
-; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: v_mov_b32_e32 v4, s12
-; SDAG-NEXT: v_mov_b32_e32 v5, s13
-; SDAG-NEXT: v_mov_b32_e32 v6, s14
-; SDAG-NEXT: v_mov_b32_e32 v7, s15
+; SDAG-NEXT: v_mov_b32_e32 v2, s8
+; SDAG-NEXT: v_mov_b32_e32 v3, s9
+; SDAG-NEXT: v_mov_b32_e32 v4, s10
+; SDAG-NEXT: v_mov_b32_e32 v5, s11
+; SDAG-NEXT: v_mov_b32_e32 v6, s12
+; SDAG-NEXT: v_mov_b32_e32 v7, s13
+; SDAG-NEXT: v_mov_b32_e32 v8, s14
+; SDAG-NEXT: v_mov_b32_e32 v9, s15
; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; SDAG-NEXT: v_mov_b32_e32 v8, s16
-; SDAG-NEXT: v_mov_b32_e32 v9, s17
-; SDAG-NEXT: v_mov_b32_e32 v10, s18
-; SDAG-NEXT: v_mov_b32_e32 v11, s19
-; SDAG-NEXT: v_mov_b32_e32 v12, s20
-; SDAG-NEXT: v_mov_b32_e32 v13, s21
-; SDAG-NEXT: v_mov_b32_e32 v14, s22
-; SDAG-NEXT: v_mov_b32_e32 v15, s23
+; SDAG-NEXT: v_mov_b32_e32 v10, s16
+; SDAG-NEXT: v_mov_b32_e32 v11, s17
+; SDAG-NEXT: v_mov_b32_e32 v12, s18
+; SDAG-NEXT: v_mov_b32_e32 v13, s19
+; SDAG-NEXT: v_mov_b32_e32 v14, s20
+; SDAG-NEXT: v_mov_b32_e32 v15, s21
+; SDAG-NEXT: v_mov_b32_e32 v16, s22
+; SDAG-NEXT: v_mov_b32_e32 v17, s23
; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1.0, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[2:9], v[10:17], a[0:3], 1.0, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[4:5]
+; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__inline_imm:
@@ -2162,34 +2162,34 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v0, s8
-; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: v_mov_b32_e32 v4, s12
-; SDAG-NEXT: v_mov_b32_e32 v5, s13
-; SDAG-NEXT: v_mov_b32_e32 v6, s14
-; SDAG-NEXT: v_mov_b32_e32 v7, s15
+; SDAG-NEXT: v_mov_b32_e32 v2, s8
+; SDAG-NEXT: v_mov_b32_e32 v3, s9
+; SDAG-NEXT: v_mov_b32_e32 v4, s10
+; SDAG-NEXT: v_mov_b32_e32 v5, s11
+; SDAG-NEXT: v_mov_b32_e32 v6, s12
+; SDAG-NEXT: v_mov_b32_e32 v7, s13
+; SDAG-NEXT: v_mov_b32_e32 v8, s14
+; SDAG-NEXT: v_mov_b32_e32 v9, s15
; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; SDAG-NEXT: v_mov_b32_e32 v8, s16
-; SDAG-NEXT: v_mov_b32_e32 v9, s17
-; SDAG-NEXT: v_mov_b32_e32 v10, s18
-; SDAG-NEXT: v_mov_b32_e32 v11, s19
-; SDAG-NEXT: v_mov_b32_e32 v12, s20
-; SDAG-NEXT: v_mov_b32_e32 v13, s21
-; SDAG-NEXT: v_mov_b32_e32 v14, s22
-; SDAG-NEXT: v_mov_b32_e32 v15, s23
+; SDAG-NEXT: v_mov_b32_e32 v10, s16
+; SDAG-NEXT: v_mov_b32_e32 v11, s17
+; SDAG-NEXT: v_mov_b32_e32 v12, s18
+; SDAG-NEXT: v_mov_b32_e32 v13, s19
+; SDAG-NEXT: v_mov_b32_e32 v14, s20
+; SDAG-NEXT: v_mov_b32_e32 v15, s21
+; SDAG-NEXT: v_mov_b32_e32 v16, s22
+; SDAG-NEXT: v_mov_b32_e32 v17, s23
; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1.0, 0.15915494 op_sel:[1,1,0] op_sel_hi:[1,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[2:9], v[10:17], a[0:3], 1.0, 0.15915494 op_sel:[1,1,0] op_sel_hi:[1,0,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[4:5]
+; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__FP_literal:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
index 91197f915b659..bc50058778dbf 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
@@ -3515,26 +3515,26 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inr
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v24, s0
-; SDAG-NEXT: v_mov_b32_e32 v25, s1
-; SDAG-NEXT: v_mov_b32_e32 v26, s2
-; SDAG-NEXT: v_mov_b32_e32 v27, s3
-; SDAG-NEXT: v_mov_b32_e32 v28, s16
-; SDAG-NEXT: v_mov_b32_e32 v29, s17
-; SDAG-NEXT: v_mov_b32_e32 v30, s18
-; SDAG-NEXT: v_mov_b32_e32 v31, s19
-; SDAG-NEXT: v_mov_b32_e32 v32, s28
-; SDAG-NEXT: v_mov_b32_e32 v33, s29
-; SDAG-NEXT: v_mov_b32_e32 v16, s20
-; SDAG-NEXT: v_mov_b32_e32 v17, s21
-; SDAG-NEXT: v_mov_b32_e32 v18, s22
-; SDAG-NEXT: v_mov_b32_e32 v19, s23
-; SDAG-NEXT: v_mov_b32_e32 v20, s24
-; SDAG-NEXT: v_mov_b32_e32 v21, s25
-; SDAG-NEXT: v_mov_b32_e32 v22, s26
-; SDAG-NEXT: v_mov_b32_e32 v23, s27
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v32
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v33
+; SDAG-NEXT: v_mov_b32_e32 v26, s0
+; SDAG-NEXT: v_mov_b32_e32 v27, s1
+; SDAG-NEXT: v_mov_b32_e32 v28, s2
+; SDAG-NEXT: v_mov_b32_e32 v29, s3
+; SDAG-NEXT: v_mov_b32_e32 v30, s16
+; SDAG-NEXT: v_mov_b32_e32 v31, s17
+; SDAG-NEXT: v_mov_b32_e32 v32, s18
+; SDAG-NEXT: v_mov_b32_e32 v33, s19
+; SDAG-NEXT: v_mov_b32_e32 v16, s28
+; SDAG-NEXT: v_mov_b32_e32 v17, s29
+; SDAG-NEXT: v_mov_b32_e32 v18, s20
+; SDAG-NEXT: v_mov_b32_e32 v19, s21
+; SDAG-NEXT: v_mov_b32_e32 v20, s22
+; SDAG-NEXT: v_mov_b32_e32 v21, s23
+; SDAG-NEXT: v_mov_b32_e32 v22, s24
+; SDAG-NEXT: v_mov_b32_e32 v23, s25
+; SDAG-NEXT: v_mov_b32_e32 v24, s26
+; SDAG-NEXT: v_mov_b32_e32 v25, s27
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
; SDAG-NEXT: v_accvgpr_write_b32 a2, v0
; SDAG-NEXT: v_accvgpr_write_b32 a3, v1
; SDAG-NEXT: v_accvgpr_write_b32 a4, v2
@@ -3550,7 +3550,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inr
; SDAG-NEXT: v_accvgpr_write_b32 a14, v12
; SDAG-NEXT: v_accvgpr_write_b32 a15, v13
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[24:31], v[16:23], a[0:15], v14, v15 op_sel_hi:[0,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[18:25], a[0:15], v14, v15 op_sel_hi:[0,0,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
@@ -3993,34 +3993,34 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgpr_sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v16, s0
-; SDAG-NEXT: v_mov_b32_e32 v17, s1
-; SDAG-NEXT: v_mov_b32_e32 v18, s2
-; SDAG-NEXT: v_mov_b32_e32 v19, s3
-; SDAG-NEXT: v_mov_b32_e32 v20, s16
-; SDAG-NEXT: v_mov_b32_e32 v21, s17
-; SDAG-NEXT: v_mov_b32_e32 v22, s18
-; SDAG-NEXT: v_mov_b32_e32 v23, s19
-; SDAG-NEXT: v_mov_b32_e32 v24, s20
-; SDAG-NEXT: v_mov_b32_e32 v25, s21
-; SDAG-NEXT: v_mov_b32_e32 v26, s22
-; SDAG-NEXT: v_mov_b32_e32 v27, s23
-; SDAG-NEXT: v_mov_b32_e32 v28, s24
-; SDAG-NEXT: v_mov_b32_e32 v29, s25
-; SDAG-NEXT: v_mov_b32_e32 v30, s26
-; SDAG-NEXT: v_mov_b32_e32 v31, s27
-; SDAG-NEXT: v_mov_b32_e32 v32, s28
-; SDAG-NEXT: v_mov_b32_e32 v33, s29
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v30
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v31
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v32
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v33
+; SDAG-NEXT: v_mov_b32_e32 v26, s0
+; SDAG-NEXT: v_mov_b32_e32 v27, s1
+; SDAG-NEXT: v_mov_b32_e32 v28, s2
+; SDAG-NEXT: v_mov_b32_e32 v29, s3
+; SDAG-NEXT: v_mov_b32_e32 v30, s16
+; SDAG-NEXT: v_mov_b32_e32 v31, s17
+; SDAG-NEXT: v_mov_b32_e32 v32, s18
+; SDAG-NEXT: v_mov_b32_e32 v33, s19
+; SDAG-NEXT: v_mov_b32_e32 v16, s20
+; SDAG-NEXT: v_mov_b32_e32 v17, s21
+; SDAG-NEXT: v_mov_b32_e32 v18, s22
+; SDAG-NEXT: v_mov_b32_e32 v19, s23
+; SDAG-NEXT: v_mov_b32_e32 v20, s24
+; SDAG-NEXT: v_mov_b32_e32 v21, s25
+; SDAG-NEXT: v_mov_b32_e32 v22, s26
+; SDAG-NEXT: v_mov_b32_e32 v23, s27
+; SDAG-NEXT: v_mov_b32_e32 v24, s28
+; SDAG-NEXT: v_mov_b32_e32 v25, s29
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
; SDAG-NEXT: v_accvgpr_write_b32 a10, v8
; SDAG-NEXT: v_accvgpr_write_b32 a11, v9
; SDAG-NEXT: v_accvgpr_write_b32 a12, v10
@@ -4028,7 +4028,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp
; SDAG-NEXT: v_accvgpr_write_b32 a14, v12
; SDAG-NEXT: v_accvgpr_write_b32 a15, v13
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[16:23], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
@@ -4540,22 +4540,22 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32>
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x80
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_accvgpr_write_b32 a0, s36
-; SDAG-NEXT: v_mov_b32_e32 v0, s8
-; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: v_mov_b32_e32 v4, s12
-; SDAG-NEXT: v_mov_b32_e32 v5, s13
-; SDAG-NEXT: v_mov_b32_e32 v6, s14
-; SDAG-NEXT: v_mov_b32_e32 v7, s15
-; SDAG-NEXT: v_mov_b32_e32 v8, s16
-; SDAG-NEXT: v_mov_b32_e32 v9, s17
-; SDAG-NEXT: v_mov_b32_e32 v10, s18
-; SDAG-NEXT: v_mov_b32_e32 v11, s19
-; SDAG-NEXT: v_mov_b32_e32 v12, s20
-; SDAG-NEXT: v_mov_b32_e32 v13, s21
-; SDAG-NEXT: v_mov_b32_e32 v14, s22
-; SDAG-NEXT: v_mov_b32_e32 v15, s23
+; SDAG-NEXT: v_mov_b32_e32 v2, s8
+; SDAG-NEXT: v_mov_b32_e32 v3, s9
+; SDAG-NEXT: v_mov_b32_e32 v4, s10
+; SDAG-NEXT: v_mov_b32_e32 v5, s11
+; SDAG-NEXT: v_mov_b32_e32 v6, s12
+; SDAG-NEXT: v_mov_b32_e32 v7, s13
+; SDAG-NEXT: v_mov_b32_e32 v8, s14
+; SDAG-NEXT: v_mov_b32_e32 v9, s15
+; SDAG-NEXT: v_mov_b32_e32 v10, s16
+; SDAG-NEXT: v_mov_b32_e32 v11, s17
+; SDAG-NEXT: v_mov_b32_e32 v12, s18
+; SDAG-NEXT: v_mov_b32_e32 v13, s19
+; SDAG-NEXT: v_mov_b32_e32 v14, s20
+; SDAG-NEXT: v_mov_b32_e32 v15, s21
+; SDAG-NEXT: v_mov_b32_e32 v16, s22
+; SDAG-NEXT: v_mov_b32_e32 v17, s23
; SDAG-NEXT: v_accvgpr_write_b32 a1, s37
; SDAG-NEXT: v_accvgpr_write_b32 a2, s38
; SDAG-NEXT: v_accvgpr_write_b32 a3, s39
@@ -4571,9 +4571,9 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32>
; SDAG-NEXT: v_accvgpr_write_b32 a13, s49
; SDAG-NEXT: v_accvgpr_write_b32 a14, s50
; SDAG-NEXT: v_accvgpr_write_b32 a15, s51
-; SDAG-NEXT: v_mov_b32_e32 v16, s1
+; SDAG-NEXT: v_mov_b32_e32 v0, s1
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v16 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[2:9], v[10:17], a[0:15], s0, v0 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
; SDAG-NEXT: v_mov_b32_e32 v0, 0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 7
@@ -4735,26 +4735,26 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac(<8 x
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v0, s12
-; SDAG-NEXT: v_mov_b32_e32 v1, s13
-; SDAG-NEXT: v_mov_b32_e32 v2, s14
-; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: v_mov_b32_e32 v4, s16
-; SDAG-NEXT: v_mov_b32_e32 v5, s17
-; SDAG-NEXT: v_mov_b32_e32 v6, s18
-; SDAG-NEXT: v_mov_b32_e32 v7, s19
-; SDAG-NEXT: v_mov_b32_e32 v8, s20
-; SDAG-NEXT: v_mov_b32_e32 v9, s21
-; SDAG-NEXT: v_mov_b32_e32 v10, s22
-; SDAG-NEXT: v_mov_b32_e32 v11, s23
+; SDAG-NEXT: v_mov_b32_e32 v2, s12
+; SDAG-NEXT: v_mov_b32_e32 v3, s13
+; SDAG-NEXT: v_mov_b32_e32 v4, s14
+; SDAG-NEXT: v_mov_b32_e32 v5, s15
+; SDAG-NEXT: v_mov_b32_e32 v6, s16
+; SDAG-NEXT: v_mov_b32_e32 v7, s17
+; SDAG-NEXT: v_mov_b32_e32 v8, s18
+; SDAG-NEXT: v_mov_b32_e32 v9, s19
+; SDAG-NEXT: v_mov_b32_e32 v10, s20
+; SDAG-NEXT: v_mov_b32_e32 v11, s21
+; SDAG-NEXT: v_mov_b32_e32 v12, s22
+; SDAG-NEXT: v_mov_b32_e32 v13, s23
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80
-; SDAG-NEXT: v_mov_b32_e32 v12, s24
-; SDAG-NEXT: v_mov_b32_e32 v13, s25
-; SDAG-NEXT: v_mov_b32_e32 v14, s26
+; SDAG-NEXT: v_mov_b32_e32 v14, s24
+; SDAG-NEXT: v_mov_b32_e32 v15, s25
+; SDAG-NEXT: v_mov_b32_e32 v16, s26
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
-; SDAG-NEXT: v_mov_b32_e32 v15, s27
+; SDAG-NEXT: v_mov_b32_e32 v17, s27
; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
@@ -4770,45 +4770,44 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac(<8 x
; SDAG-NEXT: v_accvgpr_write_b32 a13, s21
; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
-; SDAG-NEXT: v_mov_b32_e32 v16, s1
+; SDAG-NEXT: v_mov_b32_e32 v0, s1
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v16 op_sel_hi:[0,0,0]
-; SDAG-NEXT: v_mov_b32_e32 v0, s20
-; SDAG-NEXT: v_mov_b32_e32 v1, s21
-; SDAG-NEXT: v_mov_b32_e32 v2, s22
-; SDAG-NEXT: v_mov_b32_e32 v3, s23
-; SDAG-NEXT: v_mov_b64_e32 v[4:5], 48
-; SDAG-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[2:9], v[10:17], a[0:15], s0, v0 op_sel_hi:[0,0,0]
+; SDAG-NEXT: v_mov_b32_e32 v2, s20
+; SDAG-NEXT: v_mov_b32_e32 v3, s21
+; SDAG-NEXT: v_mov_b32_e32 v4, s22
+; SDAG-NEXT: v_mov_b32_e32 v5, s23
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48
+; SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[6:7], 32
-; SDAG-NEXT: v_mov_b64_e32 v[8:9], 16
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v6, s18
+; SDAG-NEXT: v_mov_b32_e32 v7, s19
+; SDAG-NEXT: v_mov_b32_e32 v4, s16
+; SDAG-NEXT: v_mov_b32_e32 v5, s17
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], 32
+; SDAG-NEXT: global_store_dwordx4 v[2:3], v[4:7], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[10:11], 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s12
-; SDAG-NEXT: v_mov_b32_e32 v1, s13
-; SDAG-NEXT: v_mov_b32_e32 v2, s14
-; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v8, s14
+; SDAG-NEXT: v_mov_b32_e32 v9, s15
+; SDAG-NEXT: v_mov_b32_e32 v6, s12
+; SDAG-NEXT: v_mov_b32_e32 v7, s13
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16
+; SDAG-NEXT: global_store_dwordx4 v[4:5], v[6:9], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s8
-; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v10, s10
+; SDAG-NEXT: v_mov_b32_e32 v11, s11
+; SDAG-NEXT: v_mov_b32_e32 v8, s8
+; SDAG-NEXT: v_mov_b32_e32 v9, s9
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], 0
+; SDAG-NEXT: global_store_dwordx4 v[6:7], v[8:11], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[6:7], a[8:11], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[2:3], a[8:11], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[4:5], a[12:15], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[0:1], a[12:15], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[10:11], a[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[6:7], a[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[8:9], a[4:7], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[4:5], a[4:7], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -4922,42 +4921,41 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac(<8
; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 25, 42 op_sel_hi:[0,0,0] blgp:2
-; SDAG-NEXT: v_mov_b32_e32 v0, s20
-; SDAG-NEXT: v_mov_b32_e32 v1, s21
-; SDAG-NEXT: v_mov_b32_e32 v2, s22
-; SDAG-NEXT: v_mov_b32_e32 v3, s23
-; SDAG-NEXT: v_mov_b64_e32 v[4:5], 48
-; SDAG-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v2, s20
+; SDAG-NEXT: v_mov_b32_e32 v3, s21
+; SDAG-NEXT: v_mov_b32_e32 v4, s22
+; SDAG-NEXT: v_mov_b32_e32 v5, s23
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48
+; SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[6:7], 32
-; SDAG-NEXT: v_mov_b64_e32 v[8:9], 16
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v6, s18
+; SDAG-NEXT: v_mov_b32_e32 v7, s19
+; SDAG-NEXT: v_mov_b32_e32 v4, s16
+; SDAG-NEXT: v_mov_b32_e32 v5, s17
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], 32
+; SDAG-NEXT: global_store_dwordx4 v[2:3], v[4:7], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[10:11], 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s12
-; SDAG-NEXT: v_mov_b32_e32 v1, s13
-; SDAG-NEXT: v_mov_b32_e32 v2, s14
-; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v8, s14
+; SDAG-NEXT: v_mov_b32_e32 v9, s15
+; SDAG-NEXT: v_mov_b32_e32 v6, s12
+; SDAG-NEXT: v_mov_b32_e32 v7, s13
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16
+; SDAG-NEXT: global_store_dwordx4 v[4:5], v[6:9], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s8
-; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v10, s10
+; SDAG-NEXT: v_mov_b32_e32 v11, s11
+; SDAG-NEXT: v_mov_b32_e32 v8, s8
+; SDAG-NEXT: v_mov_b32_e32 v9, s9
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], 0
+; SDAG-NEXT: global_store_dwordx4 v[6:7], v[8:11], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[6:7], a[8:11], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[2:3], a[8:11], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[4:5], a[12:15], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[0:1], a[12:15], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[10:11], a[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[6:7], a[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[8:9], a[4:7], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[4:5], a[4:7], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -5069,42 +5067,41 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma
; SDAG-NEXT: v_accvgpr_write_b32 a16, s8
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[16:31] blgp:2
-; SDAG-NEXT: v_mov_b32_e32 v0, s20
-; SDAG-NEXT: v_mov_b32_e32 v1, s21
-; SDAG-NEXT: v_mov_b32_e32 v2, s22
-; SDAG-NEXT: v_mov_b32_e32 v3, s23
-; SDAG-NEXT: v_mov_b64_e32 v[4:5], 48
-; SDAG-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v2, s20
+; SDAG-NEXT: v_mov_b32_e32 v3, s21
+; SDAG-NEXT: v_mov_b32_e32 v4, s22
+; SDAG-NEXT: v_mov_b32_e32 v5, s23
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48
+; SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[6:7], 32
-; SDAG-NEXT: v_mov_b64_e32 v[8:9], 16
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v6, s18
+; SDAG-NEXT: v_mov_b32_e32 v7, s19
+; SDAG-NEXT: v_mov_b32_e32 v4, s16
+; SDAG-NEXT: v_mov_b32_e32 v5, s17
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], 32
+; SDAG-NEXT: global_store_dwordx4 v[2:3], v[4:7], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[10:11], 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s12
-; SDAG-NEXT: v_mov_b32_e32 v1, s13
-; SDAG-NEXT: v_mov_b32_e32 v2, s14
-; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v8, s14
+; SDAG-NEXT: v_mov_b32_e32 v9, s15
+; SDAG-NEXT: v_mov_b32_e32 v6, s12
+; SDAG-NEXT: v_mov_b32_e32 v7, s13
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16
+; SDAG-NEXT: global_store_dwordx4 v[4:5], v[6:9], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s8
-; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v10, s10
+; SDAG-NEXT: v_mov_b32_e32 v11, s11
+; SDAG-NEXT: v_mov_b32_e32 v8, s8
+; SDAG-NEXT: v_mov_b32_e32 v9, s9
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], 0
+; SDAG-NEXT: global_store_dwordx4 v[6:7], v[8:11], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[6:7], a[8:11], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[2:3], a[8:11], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[4:5], a[12:15], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[0:1], a[12:15], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[10:11], a[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[6:7], a[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[8:9], a[4:7], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[4:5], a[4:7], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -5216,42 +5213,41 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non
; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 25, 42 op_sel_hi:[0,0,0] blgp:2
-; SDAG-NEXT: v_mov_b32_e32 v0, s20
-; SDAG-NEXT: v_mov_b32_e32 v1, s21
-; SDAG-NEXT: v_mov_b32_e32 v2, s22
-; SDAG-NEXT: v_mov_b32_e32 v3, s23
-; SDAG-NEXT: v_mov_b64_e32 v[4:5], 48
-; SDAG-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v2, s20
+; SDAG-NEXT: v_mov_b32_e32 v3, s21
+; SDAG-NEXT: v_mov_b32_e32 v4, s22
+; SDAG-NEXT: v_mov_b32_e32 v5, s23
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48
+; SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[6:7], 32
-; SDAG-NEXT: v_mov_b64_e32 v[8:9], 16
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v6, s18
+; SDAG-NEXT: v_mov_b32_e32 v7, s19
+; SDAG-NEXT: v_mov_b32_e32 v4, s16
+; SDAG-NEXT: v_mov_b32_e32 v5, s17
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], 32
+; SDAG-NEXT: global_store_dwordx4 v[2:3], v[4:7], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[10:11], 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s12
-; SDAG-NEXT: v_mov_b32_e32 v1, s13
-; SDAG-NEXT: v_mov_b32_e32 v2, s14
-; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v8, s14
+; SDAG-NEXT: v_mov_b32_e32 v9, s15
+; SDAG-NEXT: v_mov_b32_e32 v6, s12
+; SDAG-NEXT: v_mov_b32_e32 v7, s13
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16
+; SDAG-NEXT: global_store_dwordx4 v[4:5], v[6:9], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s8
-; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v10, s10
+; SDAG-NEXT: v_mov_b32_e32 v11, s11
+; SDAG-NEXT: v_mov_b32_e32 v8, s8
+; SDAG-NEXT: v_mov_b32_e32 v9, s9
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], 0
+; SDAG-NEXT: global_store_dwordx4 v[6:7], v[8:11], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[6:7], a[8:11], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[2:3], a[8:11], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[4:5], a[12:15], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[0:1], a[12:15], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[10:11], a[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[6:7], a[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[8:9], a[4:7], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[4:5], a[4:7], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll
index 6b7e8cba00a36..fb1e46d955752 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll
@@ -8,26 +8,6 @@ declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x8.xf32(<2 x float>, <2 x float>,
declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x4.xf32(<2 x float>, <2 x float>, <16 x float>, i32, i32, i32)
define amdgpu_kernel void @test_mfma_f32_16x16x8xf32(ptr addrspace(1) %arg) #0 {
-; GFX942-LABEL: test_mfma_f32_16x16x8xf32:
-; GFX942: ; %bb.0: ; %bb
-; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-NEXT: v_mov_b32_e32 v0, 1.0
-; GFX942-NEXT: v_mov_b32_e32 v1, 2.0
-; GFX942-NEXT: v_mov_b32_e32 v2, 0x40400000
-; GFX942-NEXT: v_mov_b32_e32 v3, 4.0
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX942-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX942-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX942-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_16x16x8_xf32 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX942-NEXT: s_nop 6
-; GFX942-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
-; GFX942-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8.xf32(<2 x float> <float 1.0, float 2.0>, <2 x float> <float 3.0, float 4.0>, <4 x float> %in.1, i32 1, i32 2, i32 3)
@@ -36,42 +16,6 @@ bb:
}
define amdgpu_kernel void @test_mfma_f32_32x32x4xf32(ptr addrspace(1) %arg) #0 {
-; GFX942-LABEL: test_mfma_f32_32x32x4xf32:
-; GFX942: ; %bb.0: ; %bb
-; GFX942-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
-; GFX942-NEXT: v_mov_b32_e32 v0, 1.0
-; GFX942-NEXT: v_mov_b32_e32 v1, 2.0
-; GFX942-NEXT: v_mov_b32_e32 v2, 0x40400000
-; GFX942-NEXT: v_mov_b32_e32 v3, 4.0
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX942-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX942-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX942-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX942-NEXT: v_accvgpr_write_b32 a4, s4
-; GFX942-NEXT: v_accvgpr_write_b32 a5, s5
-; GFX942-NEXT: v_accvgpr_write_b32 a6, s6
-; GFX942-NEXT: v_accvgpr_write_b32 a7, s7
-; GFX942-NEXT: v_accvgpr_write_b32 a8, s8
-; GFX942-NEXT: v_accvgpr_write_b32 a9, s9
-; GFX942-NEXT: v_accvgpr_write_b32 a10, s10
-; GFX942-NEXT: v_accvgpr_write_b32 a11, s11
-; GFX942-NEXT: v_accvgpr_write_b32 a12, s12
-; GFX942-NEXT: v_accvgpr_write_b32 a13, s13
-; GFX942-NEXT: v_accvgpr_write_b32 a14, s14
-; GFX942-NEXT: v_accvgpr_write_b32 a15, s15
-; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_32x32x4_xf32 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-NEXT: s_nop 7
-; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
-; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
-; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
-; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
-; GFX942-NEXT: s_endpgm
bb:
%in.1 = load <16 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4.xf32(<2 x float> <float 1.0, float 2.0>, <2 x float> <float 3.0, float 4.0>, <16 x float> %in.1, i32 1, i32 2, i32 3)
@@ -82,4 +26,5 @@ bb:
attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GCN: {{.*}}
+; GFX942: {{.*}}
; GISEL: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
index 77d4aad5f3174..80568810e42b5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
@@ -17,24 +17,24 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_f16__vgpr(ptr addrspace(1) %
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
+; SDAG-NEXT: v_mov_b32_e32 v12, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
+; SDAG-NEXT: global_load_dwordx4 v[14:17], v0, s[6:7]
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
-; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
-; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; SDAG-NEXT: v_mov_b32_e32 v17, s16
+; SDAG-NEXT: v_mov_b32_e32 v13, s16
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
+; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 v[14:17], v[8:11], v[0:7], v13 cbsz:1 abid:2
; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7]
+; SDAG-NEXT: global_store_dwordx4 v12, v[14:17], s[6:7]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_smfmac_f32_16x16x64_f16__vgpr:
@@ -44,23 +44,23 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_f16__vgpr(ptr addrspace(1) %
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
+; GISEL-NEXT: global_load_dwordx4 v[14:17], v0, s[6:7]
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
; GISEL-NEXT: s_load_dword s16, s[4:5], 0x64
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; GISEL-NEXT: v_mov_b32_e32 v16, s16
+; GISEL-NEXT: v_mov_b32_e32 v12, s16
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_smfmac_f32_16x16x64_f16 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
+; GISEL-NEXT: v_smfmac_f32_16x16x64_f16 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2
; GISEL-NEXT: v_mov_b32_e32 v0, 0
; GISEL-NEXT: s_nop 6
-; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[6:7]
+; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[6:7]
; GISEL-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -120,25 +120,25 @@ define <4 x float> @test_smfmac_f32_16x16x64_f16__sgpr(<8 x half> inreg %arg0, <
; SDAG-LABEL: test_smfmac_f32_16x16x64_f16__sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v8, s0
-; SDAG-NEXT: v_mov_b32_e32 v9, s1
-; SDAG-NEXT: v_mov_b32_e32 v10, s2
-; SDAG-NEXT: v_mov_b32_e32 v11, s3
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: v_mov_b32_e32 v4, s20
-; SDAG-NEXT: v_mov_b32_e32 v5, s21
-; SDAG-NEXT: v_mov_b32_e32 v6, s22
-; SDAG-NEXT: v_mov_b32_e32 v7, s23
+; SDAG-NEXT: v_mov_b32_e32 v10, s0
+; SDAG-NEXT: v_mov_b32_e32 v11, s1
+; SDAG-NEXT: v_mov_b32_e32 v12, s2
+; SDAG-NEXT: v_mov_b32_e32 v13, s3
+; SDAG-NEXT: v_mov_b32_e32 v2, s16
+; SDAG-NEXT: v_mov_b32_e32 v3, s17
+; SDAG-NEXT: v_mov_b32_e32 v4, s18
+; SDAG-NEXT: v_mov_b32_e32 v5, s19
+; SDAG-NEXT: v_mov_b32_e32 v6, s20
+; SDAG-NEXT: v_mov_b32_e32 v7, s21
+; SDAG-NEXT: v_mov_b32_e32 v8, s22
+; SDAG-NEXT: v_mov_b32_e32 v9, s23
; SDAG-NEXT: v_accvgpr_write_b32 a0, s24
; SDAG-NEXT: v_accvgpr_write_b32 a1, s25
; SDAG-NEXT: v_accvgpr_write_b32 a2, s26
; SDAG-NEXT: v_accvgpr_write_b32 a3, s27
-; SDAG-NEXT: v_mov_b32_e32 v12, s28
+; SDAG-NEXT: v_mov_b32_e32 v0, s28
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 a[0:3], v[8:11], v[0:7], v12
+; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 a[0:3], v[10:13], v[2:9], v0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
@@ -547,24 +547,24 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_bf16__vgpr(ptr addrspace(1)
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GCN-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GCN-NEXT: v_mov_b32_e32 v16, 0
+; GCN-NEXT: v_mov_b32_e32 v12, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
+; GCN-NEXT: global_load_dwordx4 v[14:17], v0, s[6:7]
; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
; GCN-NEXT: s_load_dword s16, s[4:5], 0x64
-; GCN-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
-; GCN-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
+; GCN-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; GCN-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GCN-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GCN-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; GCN-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; GCN-NEXT: v_mov_b32_e32 v17, s16
+; GCN-NEXT: v_mov_b32_e32 v13, s16
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
+; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 v[14:17], v[8:11], v[0:7], v13 cbsz:1 abid:2
; GCN-NEXT: s_nop 7
-; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7]
+; GCN-NEXT: global_store_dwordx4 v12, v[14:17], s[6:7]
; GCN-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -624,25 +624,25 @@ define <4 x float> @test_smfmac_f32_16x16x64_bf16__sgpr(<8 x bfloat> inreg %arg0
; GCN-LABEL: test_smfmac_f32_16x16x64_bf16__sgpr:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v8, s0
-; GCN-NEXT: v_mov_b32_e32 v9, s1
-; GCN-NEXT: v_mov_b32_e32 v10, s2
-; GCN-NEXT: v_mov_b32_e32 v11, s3
-; GCN-NEXT: v_mov_b32_e32 v0, s16
-; GCN-NEXT: v_mov_b32_e32 v1, s17
-; GCN-NEXT: v_mov_b32_e32 v2, s18
-; GCN-NEXT: v_mov_b32_e32 v3, s19
-; GCN-NEXT: v_mov_b32_e32 v4, s20
-; GCN-NEXT: v_mov_b32_e32 v5, s21
-; GCN-NEXT: v_mov_b32_e32 v6, s22
-; GCN-NEXT: v_mov_b32_e32 v7, s23
+; GCN-NEXT: v_mov_b32_e32 v10, s0
+; GCN-NEXT: v_mov_b32_e32 v11, s1
+; GCN-NEXT: v_mov_b32_e32 v12, s2
+; GCN-NEXT: v_mov_b32_e32 v13, s3
+; GCN-NEXT: v_mov_b32_e32 v2, s16
+; GCN-NEXT: v_mov_b32_e32 v3, s17
+; GCN-NEXT: v_mov_b32_e32 v4, s18
+; GCN-NEXT: v_mov_b32_e32 v5, s19
+; GCN-NEXT: v_mov_b32_e32 v6, s20
+; GCN-NEXT: v_mov_b32_e32 v7, s21
+; GCN-NEXT: v_mov_b32_e32 v8, s22
+; GCN-NEXT: v_mov_b32_e32 v9, s23
; GCN-NEXT: v_accvgpr_write_b32 a0, s24
; GCN-NEXT: v_accvgpr_write_b32 a1, s25
; GCN-NEXT: v_accvgpr_write_b32 a2, s26
; GCN-NEXT: v_accvgpr_write_b32 a3, s27
-; GCN-NEXT: v_mov_b32_e32 v12, s28
+; GCN-NEXT: v_mov_b32_e32 v0, s28
; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 a[0:3], v[8:11], v[0:7], v12
+; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 a[0:3], v[10:13], v[2:9], v0
; GCN-NEXT: s_nop 7
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
@@ -855,30 +855,30 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) %
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
+; SDAG-NEXT: global_load_dwordx4 v[10:13], v0, s[6:7]
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; SDAG-NEXT: v_mov_b32_e32 v12, s8
-; SDAG-NEXT: v_mov_b32_e32 v13, s9
-; SDAG-NEXT: v_mov_b32_e32 v14, s10
-; SDAG-NEXT: v_mov_b32_e32 v15, s11
-; SDAG-NEXT: v_mov_b32_e32 v0, s12
-; SDAG-NEXT: v_mov_b32_e32 v1, s13
-; SDAG-NEXT: v_mov_b32_e32 v2, s14
-; SDAG-NEXT: v_mov_b32_e32 v3, s15
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: v_mov_b32_e32 v14, s8
+; SDAG-NEXT: v_mov_b32_e32 v15, s9
+; SDAG-NEXT: v_mov_b32_e32 v16, s10
+; SDAG-NEXT: v_mov_b32_e32 v17, s11
+; SDAG-NEXT: v_mov_b32_e32 v2, s12
+; SDAG-NEXT: v_mov_b32_e32 v3, s13
+; SDAG-NEXT: v_mov_b32_e32 v4, s14
+; SDAG-NEXT: v_mov_b32_e32 v5, s15
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v4, s0
-; SDAG-NEXT: v_mov_b32_e32 v5, s1
-; SDAG-NEXT: v_mov_b32_e32 v6, s2
-; SDAG-NEXT: v_mov_b32_e32 v7, s3
-; SDAG-NEXT: v_mov_b32_e32 v17, s16
+; SDAG-NEXT: v_mov_b32_e32 v6, s0
+; SDAG-NEXT: v_mov_b32_e32 v7, s1
+; SDAG-NEXT: v_mov_b32_e32 v8, s2
+; SDAG-NEXT: v_mov_b32_e32 v9, s3
+; SDAG-NEXT: v_mov_b32_e32 v1, s16
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
+; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 v[10:13], v[14:17], v[2:9], v1 cbsz:1 abid:2
; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7]
+; SDAG-NEXT: global_store_dwordx4 v0, v[10:13], s[6:7]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_smfmac_i32_16x16x128_i8__vgpr:
@@ -887,24 +887,24 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) %
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
+; GISEL-NEXT: global_load_dwordx4 v[14:17], v0, s[0:1]
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
-; GISEL-NEXT: v_mov_b32_e32 v16, s2
+; GISEL-NEXT: v_mov_b32_e32 v12, s2
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_smfmac_i32_16x16x128_i8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
+; GISEL-NEXT: v_smfmac_i32_16x16x128_i8 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2
; GISEL-NEXT: v_mov_b32_e32 v0, 0
; GISEL-NEXT: s_nop 6
-; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
+; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1]
; GISEL-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -964,25 +964,25 @@ define <4 x i32> @test_smfmac_i32_16x16x128_i8__sgpr(<4 x i32> inreg %arg0, <8 x
; SDAG-LABEL: test_smfmac_i32_16x16x128_i8__sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v8, s0
-; SDAG-NEXT: v_mov_b32_e32 v9, s1
-; SDAG-NEXT: v_mov_b32_e32 v10, s2
-; SDAG-NEXT: v_mov_b32_e32 v11, s3
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: v_mov_b32_e32 v4, s20
-; SDAG-NEXT: v_mov_b32_e32 v5, s21
-; SDAG-NEXT: v_mov_b32_e32 v6, s22
-; SDAG-NEXT: v_mov_b32_e32 v7, s23
+; SDAG-NEXT: v_mov_b32_e32 v10, s0
+; SDAG-NEXT: v_mov_b32_e32 v11, s1
+; SDAG-NEXT: v_mov_b32_e32 v12, s2
+; SDAG-NEXT: v_mov_b32_e32 v13, s3
+; SDAG-NEXT: v_mov_b32_e32 v2, s16
+; SDAG-NEXT: v_mov_b32_e32 v3, s17
+; SDAG-NEXT: v_mov_b32_e32 v4, s18
+; SDAG-NEXT: v_mov_b32_e32 v5, s19
+; SDAG-NEXT: v_mov_b32_e32 v6, s20
+; SDAG-NEXT: v_mov_b32_e32 v7, s21
+; SDAG-NEXT: v_mov_b32_e32 v8, s22
+; SDAG-NEXT: v_mov_b32_e32 v9, s23
; SDAG-NEXT: v_accvgpr_write_b32 a0, s24
; SDAG-NEXT: v_accvgpr_write_b32 a1, s25
; SDAG-NEXT: v_accvgpr_write_b32 a2, s26
; SDAG-NEXT: v_accvgpr_write_b32 a3, s27
-; SDAG-NEXT: v_mov_b32_e32 v12, s28
+; SDAG-NEXT: v_mov_b32_e32 v0, s28
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 a[0:3], v[8:11], v[0:7], v12
+; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 a[0:3], v[10:13], v[2:9], v0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
@@ -1032,22 +1032,22 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x64_i8__vgpr(ptr addrspace(1) %a
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v24, s8
-; SDAG-NEXT: v_mov_b32_e32 v25, s9
-; SDAG-NEXT: v_mov_b32_e32 v26, s10
-; SDAG-NEXT: v_mov_b32_e32 v27, s11
-; SDAG-NEXT: v_mov_b32_e32 v16, s12
-; SDAG-NEXT: v_mov_b32_e32 v17, s13
-; SDAG-NEXT: v_mov_b32_e32 v18, s14
-; SDAG-NEXT: v_mov_b32_e32 v19, s15
-; SDAG-NEXT: v_mov_b32_e32 v20, s0
-; SDAG-NEXT: v_mov_b32_e32 v21, s1
-; SDAG-NEXT: v_mov_b32_e32 v22, s2
-; SDAG-NEXT: v_mov_b32_e32 v23, s3
-; SDAG-NEXT: v_mov_b32_e32 v28, s16
+; SDAG-NEXT: v_mov_b32_e32 v26, s8
+; SDAG-NEXT: v_mov_b32_e32 v27, s9
+; SDAG-NEXT: v_mov_b32_e32 v28, s10
+; SDAG-NEXT: v_mov_b32_e32 v29, s11
+; SDAG-NEXT: v_mov_b32_e32 v18, s12
+; SDAG-NEXT: v_mov_b32_e32 v19, s13
+; SDAG-NEXT: v_mov_b32_e32 v20, s14
+; SDAG-NEXT: v_mov_b32_e32 v21, s15
+; SDAG-NEXT: v_mov_b32_e32 v22, s0
+; SDAG-NEXT: v_mov_b32_e32 v23, s1
+; SDAG-NEXT: v_mov_b32_e32 v24, s2
+; SDAG-NEXT: v_mov_b32_e32 v25, s3
+; SDAG-NEXT: v_mov_b32_e32 v16, s16
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
+; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[26:29], v[18:25], v16 cbsz:1 abid:2
; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 2
@@ -1397,30 +1397,30 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
+; SDAG-NEXT: global_load_dwordx4 v[10:13], v0, s[6:7]
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; SDAG-NEXT: v_mov_b32_e32 v12, s8
-; SDAG-NEXT: v_mov_b32_e32 v13, s9
-; SDAG-NEXT: v_mov_b32_e32 v14, s10
-; SDAG-NEXT: v_mov_b32_e32 v15, s11
-; SDAG-NEXT: v_mov_b32_e32 v0, s12
-; SDAG-NEXT: v_mov_b32_e32 v1, s13
-; SDAG-NEXT: v_mov_b32_e32 v2, s14
-; SDAG-NEXT: v_mov_b32_e32 v3, s15
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: v_mov_b32_e32 v14, s8
+; SDAG-NEXT: v_mov_b32_e32 v15, s9
+; SDAG-NEXT: v_mov_b32_e32 v16, s10
+; SDAG-NEXT: v_mov_b32_e32 v17, s11
+; SDAG-NEXT: v_mov_b32_e32 v2, s12
+; SDAG-NEXT: v_mov_b32_e32 v3, s13
+; SDAG-NEXT: v_mov_b32_e32 v4, s14
+; SDAG-NEXT: v_mov_b32_e32 v5, s15
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v4, s0
-; SDAG-NEXT: v_mov_b32_e32 v5, s1
-; SDAG-NEXT: v_mov_b32_e32 v6, s2
-; SDAG-NEXT: v_mov_b32_e32 v7, s3
-; SDAG-NEXT: v_mov_b32_e32 v17, s16
+; SDAG-NEXT: v_mov_b32_e32 v6, s0
+; SDAG-NEXT: v_mov_b32_e32 v7, s1
+; SDAG-NEXT: v_mov_b32_e32 v8, s2
+; SDAG-NEXT: v_mov_b32_e32 v9, s3
+; SDAG-NEXT: v_mov_b32_e32 v1, s16
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
+; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[10:13], v[14:17], v[2:9], v1 cbsz:1 abid:2
; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7]
+; SDAG-NEXT: global_store_dwordx4 v0, v[10:13], s[6:7]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__vgpr:
@@ -1429,24 +1429,24 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
+; GISEL-NEXT: global_load_dwordx4 v[14:17], v0, s[0:1]
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
-; GISEL-NEXT: v_mov_b32_e32 v16, s2
+; GISEL-NEXT: v_mov_b32_e32 v12, s2
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
+; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2
; GISEL-NEXT: v_mov_b32_e32 v0, 0
; GISEL-NEXT: s_nop 6
-; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
+; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1]
; GISEL-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1506,25 +1506,25 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__sgpr(<4 x i32> inreg %arg
; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v8, s0
-; SDAG-NEXT: v_mov_b32_e32 v9, s1
-; SDAG-NEXT: v_mov_b32_e32 v10, s2
-; SDAG-NEXT: v_mov_b32_e32 v11, s3
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: v_mov_b32_e32 v4, s20
-; SDAG-NEXT: v_mov_b32_e32 v5, s21
-; SDAG-NEXT: v_mov_b32_e32 v6, s22
-; SDAG-NEXT: v_mov_b32_e32 v7, s23
+; SDAG-NEXT: v_mov_b32_e32 v10, s0
+; SDAG-NEXT: v_mov_b32_e32 v11, s1
+; SDAG-NEXT: v_mov_b32_e32 v12, s2
+; SDAG-NEXT: v_mov_b32_e32 v13, s3
+; SDAG-NEXT: v_mov_b32_e32 v2, s16
+; SDAG-NEXT: v_mov_b32_e32 v3, s17
+; SDAG-NEXT: v_mov_b32_e32 v4, s18
+; SDAG-NEXT: v_mov_b32_e32 v5, s19
+; SDAG-NEXT: v_mov_b32_e32 v6, s20
+; SDAG-NEXT: v_mov_b32_e32 v7, s21
+; SDAG-NEXT: v_mov_b32_e32 v8, s22
+; SDAG-NEXT: v_mov_b32_e32 v9, s23
; SDAG-NEXT: v_accvgpr_write_b32 a0, s24
; SDAG-NEXT: v_accvgpr_write_b32 a1, s25
; SDAG-NEXT: v_accvgpr_write_b32 a2, s26
; SDAG-NEXT: v_accvgpr_write_b32 a3, s27
-; SDAG-NEXT: v_mov_b32_e32 v12, s28
+; SDAG-NEXT: v_mov_b32_e32 v0, s28
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 a[0:3], v[8:11], v[0:7], v12
+; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 a[0:3], v[10:13], v[2:9], v0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
@@ -1566,30 +1566,30 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
+; SDAG-NEXT: global_load_dwordx4 v[10:13], v0, s[6:7]
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; SDAG-NEXT: v_mov_b32_e32 v12, s8
-; SDAG-NEXT: v_mov_b32_e32 v13, s9
-; SDAG-NEXT: v_mov_b32_e32 v14, s10
-; SDAG-NEXT: v_mov_b32_e32 v15, s11
-; SDAG-NEXT: v_mov_b32_e32 v0, s12
-; SDAG-NEXT: v_mov_b32_e32 v1, s13
-; SDAG-NEXT: v_mov_b32_e32 v2, s14
-; SDAG-NEXT: v_mov_b32_e32 v3, s15
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: v_mov_b32_e32 v14, s8
+; SDAG-NEXT: v_mov_b32_e32 v15, s9
+; SDAG-NEXT: v_mov_b32_e32 v16, s10
+; SDAG-NEXT: v_mov_b32_e32 v17, s11
+; SDAG-NEXT: v_mov_b32_e32 v2, s12
+; SDAG-NEXT: v_mov_b32_e32 v3, s13
+; SDAG-NEXT: v_mov_b32_e32 v4, s14
+; SDAG-NEXT: v_mov_b32_e32 v5, s15
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v4, s0
-; SDAG-NEXT: v_mov_b32_e32 v5, s1
-; SDAG-NEXT: v_mov_b32_e32 v6, s2
-; SDAG-NEXT: v_mov_b32_e32 v7, s3
-; SDAG-NEXT: v_mov_b32_e32 v17, s16
+; SDAG-NEXT: v_mov_b32_e32 v6, s0
+; SDAG-NEXT: v_mov_b32_e32 v7, s1
+; SDAG-NEXT: v_mov_b32_e32 v8, s2
+; SDAG-NEXT: v_mov_b32_e32 v9, s3
+; SDAG-NEXT: v_mov_b32_e32 v1, s16
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
+; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[10:13], v[14:17], v[2:9], v1 cbsz:1 abid:2
; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7]
+; SDAG-NEXT: global_store_dwordx4 v0, v[10:13], s[6:7]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__vgpr:
@@ -1598,24 +1598,24 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
+; GISEL-NEXT: global_load_dwordx4 v[14:17], v0, s[0:1]
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
-; GISEL-NEXT: v_mov_b32_e32 v16, s2
+; GISEL-NEXT: v_mov_b32_e32 v12, s2
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
+; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2
; GISEL-NEXT: v_mov_b32_e32 v0, 0
; GISEL-NEXT: s_nop 6
-; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
+; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1]
; GISEL-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1675,25 +1675,25 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__sgpr(<4 x i32> inreg %arg
; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v8, s0
-; SDAG-NEXT: v_mov_b32_e32 v9, s1
-; SDAG-NEXT: v_mov_b32_e32 v10, s2
-; SDAG-NEXT: v_mov_b32_e32 v11, s3
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: v_mov_b32_e32 v4, s20
-; SDAG-NEXT: v_mov_b32_e32 v5, s21
-; SDAG-NEXT: v_mov_b32_e32 v6, s22
-; SDAG-NEXT: v_mov_b32_e32 v7, s23
+; SDAG-NEXT: v_mov_b32_e32 v10, s0
+; SDAG-NEXT: v_mov_b32_e32 v11, s1
+; SDAG-NEXT: v_mov_b32_e32 v12, s2
+; SDAG-NEXT: v_mov_b32_e32 v13, s3
+; SDAG-NEXT: v_mov_b32_e32 v2, s16
+; SDAG-NEXT: v_mov_b32_e32 v3, s17
+; SDAG-NEXT: v_mov_b32_e32 v4, s18
+; SDAG-NEXT: v_mov_b32_e32 v5, s19
+; SDAG-NEXT: v_mov_b32_e32 v6, s20
+; SDAG-NEXT: v_mov_b32_e32 v7, s21
+; SDAG-NEXT: v_mov_b32_e32 v8, s22
+; SDAG-NEXT: v_mov_b32_e32 v9, s23
; SDAG-NEXT: v_accvgpr_write_b32 a0, s24
; SDAG-NEXT: v_accvgpr_write_b32 a1, s25
; SDAG-NEXT: v_accvgpr_write_b32 a2, s26
; SDAG-NEXT: v_accvgpr_write_b32 a3, s27
-; SDAG-NEXT: v_mov_b32_e32 v12, s28
+; SDAG-NEXT: v_mov_b32_e32 v0, s28
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[8:11], v[0:7], v12
+; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[10:13], v[2:9], v0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
@@ -1735,30 +1735,30 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
+; SDAG-NEXT: global_load_dwordx4 v[10:13], v0, s[6:7]
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; SDAG-NEXT: v_mov_b32_e32 v12, s8
-; SDAG-NEXT: v_mov_b32_e32 v13, s9
-; SDAG-NEXT: v_mov_b32_e32 v14, s10
-; SDAG-NEXT: v_mov_b32_e32 v15, s11
-; SDAG-NEXT: v_mov_b32_e32 v0, s12
-; SDAG-NEXT: v_mov_b32_e32 v1, s13
-; SDAG-NEXT: v_mov_b32_e32 v2, s14
-; SDAG-NEXT: v_mov_b32_e32 v3, s15
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: v_mov_b32_e32 v14, s8
+; SDAG-NEXT: v_mov_b32_e32 v15, s9
+; SDAG-NEXT: v_mov_b32_e32 v16, s10
+; SDAG-NEXT: v_mov_b32_e32 v17, s11
+; SDAG-NEXT: v_mov_b32_e32 v2, s12
+; SDAG-NEXT: v_mov_b32_e32 v3, s13
+; SDAG-NEXT: v_mov_b32_e32 v4, s14
+; SDAG-NEXT: v_mov_b32_e32 v5, s15
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v4, s0
-; SDAG-NEXT: v_mov_b32_e32 v5, s1
-; SDAG-NEXT: v_mov_b32_e32 v6, s2
-; SDAG-NEXT: v_mov_b32_e32 v7, s3
-; SDAG-NEXT: v_mov_b32_e32 v17, s16
+; SDAG-NEXT: v_mov_b32_e32 v6, s0
+; SDAG-NEXT: v_mov_b32_e32 v7, s1
+; SDAG-NEXT: v_mov_b32_e32 v8, s2
+; SDAG-NEXT: v_mov_b32_e32 v9, s3
+; SDAG-NEXT: v_mov_b32_e32 v1, s16
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
+; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[10:13], v[14:17], v[2:9], v1 cbsz:1 abid:2
; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7]
+; SDAG-NEXT: global_store_dwordx4 v0, v[10:13], s[6:7]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__vgpr:
@@ -1767,24 +1767,24 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
+; GISEL-NEXT: global_load_dwordx4 v[14:17], v0, s[0:1]
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
-; GISEL-NEXT: v_mov_b32_e32 v16, s2
+; GISEL-NEXT: v_mov_b32_e32 v12, s2
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
+; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2
; GISEL-NEXT: v_mov_b32_e32 v0, 0
; GISEL-NEXT: s_nop 6
-; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
+; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1]
; GISEL-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1844,25 +1844,25 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__sgpr(<4 x i32> inreg %arg
; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v8, s0
-; SDAG-NEXT: v_mov_b32_e32 v9, s1
-; SDAG-NEXT: v_mov_b32_e32 v10, s2
-; SDAG-NEXT: v_mov_b32_e32 v11, s3
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: v_mov_b32_e32 v4, s20
-; SDAG-NEXT: v_mov_b32_e32 v5, s21
-; SDAG-NEXT: v_mov_b32_e32 v6, s22
-; SDAG-NEXT: v_mov_b32_e32 v7, s23
+; SDAG-NEXT: v_mov_b32_e32 v10, s0
+; SDAG-NEXT: v_mov_b32_e32 v11, s1
+; SDAG-NEXT: v_mov_b32_e32 v12, s2
+; SDAG-NEXT: v_mov_b32_e32 v13, s3
+; SDAG-NEXT: v_mov_b32_e32 v2, s16
+; SDAG-NEXT: v_mov_b32_e32 v3, s17
+; SDAG-NEXT: v_mov_b32_e32 v4, s18
+; SDAG-NEXT: v_mov_b32_e32 v5, s19
+; SDAG-NEXT: v_mov_b32_e32 v6, s20
+; SDAG-NEXT: v_mov_b32_e32 v7, s21
+; SDAG-NEXT: v_mov_b32_e32 v8, s22
+; SDAG-NEXT: v_mov_b32_e32 v9, s23
; SDAG-NEXT: v_accvgpr_write_b32 a0, s24
; SDAG-NEXT: v_accvgpr_write_b32 a1, s25
; SDAG-NEXT: v_accvgpr_write_b32 a2, s26
; SDAG-NEXT: v_accvgpr_write_b32 a3, s27
-; SDAG-NEXT: v_mov_b32_e32 v12, s28
+; SDAG-NEXT: v_mov_b32_e32 v0, s28
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 a[0:3], v[8:11], v[0:7], v12
+; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 a[0:3], v[10:13], v[2:9], v0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
@@ -1904,30 +1904,30 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
+; SDAG-NEXT: global_load_dwordx4 v[10:13], v0, s[6:7]
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; SDAG-NEXT: v_mov_b32_e32 v12, s8
-; SDAG-NEXT: v_mov_b32_e32 v13, s9
-; SDAG-NEXT: v_mov_b32_e32 v14, s10
-; SDAG-NEXT: v_mov_b32_e32 v15, s11
-; SDAG-NEXT: v_mov_b32_e32 v0, s12
-; SDAG-NEXT: v_mov_b32_e32 v1, s13
-; SDAG-NEXT: v_mov_b32_e32 v2, s14
-; SDAG-NEXT: v_mov_b32_e32 v3, s15
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: v_mov_b32_e32 v14, s8
+; SDAG-NEXT: v_mov_b32_e32 v15, s9
+; SDAG-NEXT: v_mov_b32_e32 v16, s10
+; SDAG-NEXT: v_mov_b32_e32 v17, s11
+; SDAG-NEXT: v_mov_b32_e32 v2, s12
+; SDAG-NEXT: v_mov_b32_e32 v3, s13
+; SDAG-NEXT: v_mov_b32_e32 v4, s14
+; SDAG-NEXT: v_mov_b32_e32 v5, s15
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v4, s0
-; SDAG-NEXT: v_mov_b32_e32 v5, s1
-; SDAG-NEXT: v_mov_b32_e32 v6, s2
-; SDAG-NEXT: v_mov_b32_e32 v7, s3
-; SDAG-NEXT: v_mov_b32_e32 v17, s16
+; SDAG-NEXT: v_mov_b32_e32 v6, s0
+; SDAG-NEXT: v_mov_b32_e32 v7, s1
+; SDAG-NEXT: v_mov_b32_e32 v8, s2
+; SDAG-NEXT: v_mov_b32_e32 v9, s3
+; SDAG-NEXT: v_mov_b32_e32 v1, s16
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
+; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[10:13], v[14:17], v[2:9], v1 cbsz:1 abid:2
; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7]
+; SDAG-NEXT: global_store_dwordx4 v0, v[10:13], s[6:7]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__vgpr:
@@ -1936,24 +1936,24 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
+; GISEL-NEXT: global_load_dwordx4 v[14:17], v0, s[0:1]
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
-; GISEL-NEXT: v_mov_b32_e32 v16, s2
+; GISEL-NEXT: v_mov_b32_e32 v12, s2
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
+; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2
; GISEL-NEXT: v_mov_b32_e32 v0, 0
; GISEL-NEXT: s_nop 6
-; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
+; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1]
; GISEL-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -2013,25 +2013,25 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__sgpr(<4 x i32> inreg %arg
; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v8, s0
-; SDAG-NEXT: v_mov_b32_e32 v9, s1
-; SDAG-NEXT: v_mov_b32_e32 v10, s2
-; SDAG-NEXT: v_mov_b32_e32 v11, s3
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: v_mov_b32_e32 v4, s20
-; SDAG-NEXT: v_mov_b32_e32 v5, s21
-; SDAG-NEXT: v_mov_b32_e32 v6, s22
-; SDAG-NEXT: v_mov_b32_e32 v7, s23
+; SDAG-NEXT: v_mov_b32_e32 v10, s0
+; SDAG-NEXT: v_mov_b32_e32 v11, s1
+; SDAG-NEXT: v_mov_b32_e32 v12, s2
+; SDAG-NEXT: v_mov_b32_e32 v13, s3
+; SDAG-NEXT: v_mov_b32_e32 v2, s16
+; SDAG-NEXT: v_mov_b32_e32 v3, s17
+; SDAG-NEXT: v_mov_b32_e32 v4, s18
+; SDAG-NEXT: v_mov_b32_e32 v5, s19
+; SDAG-NEXT: v_mov_b32_e32 v6, s20
+; SDAG-NEXT: v_mov_b32_e32 v7, s21
+; SDAG-NEXT: v_mov_b32_e32 v8, s22
+; SDAG-NEXT: v_mov_b32_e32 v9, s23
; SDAG-NEXT: v_accvgpr_write_b32 a0, s24
; SDAG-NEXT: v_accvgpr_write_b32 a1, s25
; SDAG-NEXT: v_accvgpr_write_b32 a2, s26
; SDAG-NEXT: v_accvgpr_write_b32 a3, s27
-; SDAG-NEXT: v_mov_b32_e32 v12, s28
+; SDAG-NEXT: v_mov_b32_e32 v0, s28
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[8:11], v[0:7], v12
+; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[10:13], v[2:9], v0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
@@ -2081,22 +2081,22 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_bf8__vgpr(ptr addrspace(
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v24, s8
-; SDAG-NEXT: v_mov_b32_e32 v25, s9
-; SDAG-NEXT: v_mov_b32_e32 v26, s10
-; SDAG-NEXT: v_mov_b32_e32 v27, s11
-; SDAG-NEXT: v_mov_b32_e32 v16, s12
-; SDAG-NEXT: v_mov_b32_e32 v17, s13
-; SDAG-NEXT: v_mov_b32_e32 v18, s14
-; SDAG-NEXT: v_mov_b32_e32 v19, s15
-; SDAG-NEXT: v_mov_b32_e32 v20, s0
-; SDAG-NEXT: v_mov_b32_e32 v21, s1
-; SDAG-NEXT: v_mov_b32_e32 v22, s2
-; SDAG-NEXT: v_mov_b32_e32 v23, s3
-; SDAG-NEXT: v_mov_b32_e32 v28, s16
+; SDAG-NEXT: v_mov_b32_e32 v26, s8
+; SDAG-NEXT: v_mov_b32_e32 v27, s9
+; SDAG-NEXT: v_mov_b32_e32 v28, s10
+; SDAG-NEXT: v_mov_b32_e32 v29, s11
+; SDAG-NEXT: v_mov_b32_e32 v18, s12
+; SDAG-NEXT: v_mov_b32_e32 v19, s13
+; SDAG-NEXT: v_mov_b32_e32 v20, s14
+; SDAG-NEXT: v_mov_b32_e32 v21, s15
+; SDAG-NEXT: v_mov_b32_e32 v22, s0
+; SDAG-NEXT: v_mov_b32_e32 v23, s1
+; SDAG-NEXT: v_mov_b32_e32 v24, s2
+; SDAG-NEXT: v_mov_b32_e32 v25, s3
+; SDAG-NEXT: v_mov_b32_e32 v16, s16
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
+; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[26:29], v[18:25], v16 cbsz:1 abid:2
; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 2
@@ -2454,22 +2454,22 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_fp8__vgpr(ptr addrspace(
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v24, s8
-; SDAG-NEXT: v_mov_b32_e32 v25, s9
-; SDAG-NEXT: v_mov_b32_e32 v26, s10
-; SDAG-NEXT: v_mov_b32_e32 v27, s11
-; SDAG-NEXT: v_mov_b32_e32 v16, s12
-; SDAG-NEXT: v_mov_b32_e32 v17, s13
-; SDAG-NEXT: v_mov_b32_e32 v18, s14
-; SDAG-NEXT: v_mov_b32_e32 v19, s15
-; SDAG-NEXT: v_mov_b32_e32 v20, s0
-; SDAG-NEXT: v_mov_b32_e32 v21, s1
-; SDAG-NEXT: v_mov_b32_e32 v22, s2
-; SDAG-NEXT: v_mov_b32_e32 v23, s3
-; SDAG-NEXT: v_mov_b32_e32 v28, s16
+; SDAG-NEXT: v_mov_b32_e32 v26, s8
+; SDAG-NEXT: v_mov_b32_e32 v27, s9
+; SDAG-NEXT: v_mov_b32_e32 v28, s10
+; SDAG-NEXT: v_mov_b32_e32 v29, s11
+; SDAG-NEXT: v_mov_b32_e32 v18, s12
+; SDAG-NEXT: v_mov_b32_e32 v19, s13
+; SDAG-NEXT: v_mov_b32_e32 v20, s14
+; SDAG-NEXT: v_mov_b32_e32 v21, s15
+; SDAG-NEXT: v_mov_b32_e32 v22, s0
+; SDAG-NEXT: v_mov_b32_e32 v23, s1
+; SDAG-NEXT: v_mov_b32_e32 v24, s2
+; SDAG-NEXT: v_mov_b32_e32 v25, s3
+; SDAG-NEXT: v_mov_b32_e32 v16, s16
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
+; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[26:29], v[18:25], v16 cbsz:1 abid:2
; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 2
@@ -2827,22 +2827,22 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_bf8__vgpr(ptr addrspace(
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v24, s8
-; SDAG-NEXT: v_mov_b32_e32 v25, s9
-; SDAG-NEXT: v_mov_b32_e32 v26, s10
-; SDAG-NEXT: v_mov_b32_e32 v27, s11
-; SDAG-NEXT: v_mov_b32_e32 v16, s12
-; SDAG-NEXT: v_mov_b32_e32 v17, s13
-; SDAG-NEXT: v_mov_b32_e32 v18, s14
-; SDAG-NEXT: v_mov_b32_e32 v19, s15
-; SDAG-NEXT: v_mov_b32_e32 v20, s0
-; SDAG-NEXT: v_mov_b32_e32 v21, s1
-; SDAG-NEXT: v_mov_b32_e32 v22, s2
-; SDAG-NEXT: v_mov_b32_e32 v23, s3
-; SDAG-NEXT: v_mov_b32_e32 v28, s16
+; SDAG-NEXT: v_mov_b32_e32 v26, s8
+; SDAG-NEXT: v_mov_b32_e32 v27, s9
+; SDAG-NEXT: v_mov_b32_e32 v28, s10
+; SDAG-NEXT: v_mov_b32_e32 v29, s11
+; SDAG-NEXT: v_mov_b32_e32 v18, s12
+; SDAG-NEXT: v_mov_b32_e32 v19, s13
+; SDAG-NEXT: v_mov_b32_e32 v20, s14
+; SDAG-NEXT: v_mov_b32_e32 v21, s15
+; SDAG-NEXT: v_mov_b32_e32 v22, s0
+; SDAG-NEXT: v_mov_b32_e32 v23, s1
+; SDAG-NEXT: v_mov_b32_e32 v24, s2
+; SDAG-NEXT: v_mov_b32_e32 v25, s3
+; SDAG-NEXT: v_mov_b32_e32 v16, s16
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
+; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[26:29], v[18:25], v16 cbsz:1 abid:2
; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 2
@@ -3200,22 +3200,22 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_fp8__vgpr(ptr addrspace(
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v24, s8
-; SDAG-NEXT: v_mov_b32_e32 v25, s9
-; SDAG-NEXT: v_mov_b32_e32 v26, s10
-; SDAG-NEXT: v_mov_b32_e32 v27, s11
-; SDAG-NEXT: v_mov_b32_e32 v16, s12
-; SDAG-NEXT: v_mov_b32_e32 v17, s13
-; SDAG-NEXT: v_mov_b32_e32 v18, s14
-; SDAG-NEXT: v_mov_b32_e32 v19, s15
-; SDAG-NEXT: v_mov_b32_e32 v20, s0
-; SDAG-NEXT: v_mov_b32_e32 v21, s1
-; SDAG-NEXT: v_mov_b32_e32 v22, s2
-; SDAG-NEXT: v_mov_b32_e32 v23, s3
-; SDAG-NEXT: v_mov_b32_e32 v28, s16
+; SDAG-NEXT: v_mov_b32_e32 v26, s8
+; SDAG-NEXT: v_mov_b32_e32 v27, s9
+; SDAG-NEXT: v_mov_b32_e32 v28, s10
+; SDAG-NEXT: v_mov_b32_e32 v29, s11
+; SDAG-NEXT: v_mov_b32_e32 v18, s12
+; SDAG-NEXT: v_mov_b32_e32 v19, s13
+; SDAG-NEXT: v_mov_b32_e32 v20, s14
+; SDAG-NEXT: v_mov_b32_e32 v21, s15
+; SDAG-NEXT: v_mov_b32_e32 v22, s0
+; SDAG-NEXT: v_mov_b32_e32 v23, s1
+; SDAG-NEXT: v_mov_b32_e32 v24, s2
+; SDAG-NEXT: v_mov_b32_e32 v25, s3
+; SDAG-NEXT: v_mov_b32_e32 v16, s16
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
+; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[26:29], v[18:25], v16 cbsz:1 abid:2
; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 2
diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
index 9c38d7f5e2a05..5b0d2d22c3e3d 100644
--- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
+++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
@@ -3565,13 +3565,13 @@ define amdgpu_kernel void @fneg_v2f32_scalar(ptr addrspace(1) %a, <2 x float> %x
; PACKED-SDAG-LABEL: fneg_v2f32_scalar:
; PACKED-SDAG: ; %bb.0:
; PACKED-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; PACKED-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; PACKED-SDAG-NEXT: v_mov_b32_e32 v0, 0
; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-SDAG-NEXT: s_xor_b32 s3, s3, 0x80000000
; PACKED-SDAG-NEXT: s_xor_b32 s2, s2, 0x80000000
-; PACKED-SDAG-NEXT: v_mov_b32_e32 v0, s2
-; PACKED-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; PACKED-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; PACKED-SDAG-NEXT: v_mov_b32_e32 v2, s2
+; PACKED-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; PACKED-SDAG-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1]
; PACKED-SDAG-NEXT: s_endpgm
;
; PACKED-GISEL-LABEL: fneg_v2f32_scalar:
diff --git a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
index a5c8f0480c776..c3164b8dd07ef 100644
--- a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
@@ -71,26 +71,19 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
;
; PEI-GFX90A-LABEL: name: partial_copy
; PEI-GFX90A: bb.0 (%ir-block.0):
- ; PEI-GFX90A-NEXT: liveins: $agpr4, $sgpr4_sgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr9
+ ; PEI-GFX90A-NEXT: liveins: $sgpr4_sgpr5
; PEI-GFX90A-NEXT: {{ $}}
- ; PEI-GFX90A-NEXT: $sgpr12_sgpr13_sgpr14_sgpr15 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3
- ; PEI-GFX90A-NEXT: $sgpr12 = S_ADD_U32 $sgpr12, $sgpr9, implicit-def $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15
- ; PEI-GFX90A-NEXT: $sgpr13 = S_ADDC_U32 $sgpr13, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15
; PEI-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef renamable $agpr0
; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6422538 /* regdef:VReg_128_Align2 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3
; PEI-GFX90A-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec
- ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def renamable $vgpr0_vgpr1
- ; PEI-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5)
- ; PEI-GFX90A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1
+ ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def renamable $vgpr2_vgpr3
; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $agpr0_agpr1_agpr2_agpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1)
; PEI-GFX90A-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
; PEI-GFX90A-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec
; PEI-GFX90A-NEXT: renamable $vgpr0 = AV_MOV_B32_IMM_PSEUDO 1, implicit $exec
; PEI-GFX90A-NEXT: renamable $vgpr1 = AV_MOV_B32_IMM_PSEUDO 2, implicit $exec
; PEI-GFX90A-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_I32_4X4X4I8_e64 killed $vgpr0, killed $vgpr1, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
- ; PEI-GFX90A-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5)
- ; PEI-GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $vgpr0_vgpr1
- ; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX2 undef renamable $vgpr0_vgpr1, killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) poison`, addrspace 1)
+ ; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX2 undef renamable $vgpr0_vgpr1, killed renamable $vgpr2_vgpr3, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) poison`, addrspace 1)
; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $agpr0_agpr1_agpr2_agpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1)
; PEI-GFX90A-NEXT: S_ENDPGM 0
call void asm sideeffect "; use $0", "a" (i32 poison)
diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
index d48bfe0bb7f21..68ef30a90646e 100644
--- a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
+++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
@@ -53,31 +53,31 @@ define amdgpu_kernel void @store_v16i32(ptr addrspace(1) %out, <16 x i32> %a) {
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v0, s20
-; GFX942-NEXT: v_mov_b32_e32 v1, s21
-; GFX942-NEXT: v_mov_b32_e32 v2, s22
-; GFX942-NEXT: v_mov_b32_e32 v3, s23
-; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
+; GFX942-NEXT: v_mov_b32_e32 v2, s20
+; GFX942-NEXT: v_mov_b32_e32 v3, s21
+; GFX942-NEXT: v_mov_b32_e32 v4, s22
+; GFX942-NEXT: v_mov_b32_e32 v5, s23
+; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:48
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mov_b32_e32 v0, s16
-; GFX942-NEXT: v_mov_b32_e32 v1, s17
-; GFX942-NEXT: v_mov_b32_e32 v2, s18
-; GFX942-NEXT: v_mov_b32_e32 v3, s19
-; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
+; GFX942-NEXT: v_mov_b32_e32 v2, s16
+; GFX942-NEXT: v_mov_b32_e32 v3, s17
+; GFX942-NEXT: v_mov_b32_e32 v4, s18
+; GFX942-NEXT: v_mov_b32_e32 v5, s19
+; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:32
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mov_b32_e32 v0, s12
-; GFX942-NEXT: v_mov_b32_e32 v1, s13
-; GFX942-NEXT: v_mov_b32_e32 v2, s14
-; GFX942-NEXT: v_mov_b32_e32 v3, s15
-; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
+; GFX942-NEXT: v_mov_b32_e32 v2, s12
+; GFX942-NEXT: v_mov_b32_e32 v3, s13
+; GFX942-NEXT: v_mov_b32_e32 v4, s14
+; GFX942-NEXT: v_mov_b32_e32 v5, s15
+; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:16
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mov_b32_e32 v0, s8
-; GFX942-NEXT: v_mov_b32_e32 v1, s9
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: v_mov_b32_e32 v3, s11
-; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v2, s8
+; GFX942-NEXT: v_mov_b32_e32 v3, s9
+; GFX942-NEXT: v_mov_b32_e32 v4, s10
+; GFX942-NEXT: v_mov_b32_e32 v5, s11
+; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
; GFX942-NEXT: s_endpgm
entry:
store <16 x i32> %a, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/regalloc-illegal-eviction-assert.ll b/llvm/test/CodeGen/AMDGPU/regalloc-illegal-eviction-assert.ll
index f2fd3a8fedaaa..c035e9f9415c1 100644
--- a/llvm/test/CodeGen/AMDGPU/regalloc-illegal-eviction-assert.ll
+++ b/llvm/test/CodeGen/AMDGPU/regalloc-illegal-eviction-assert.ll
@@ -9,9 +9,9 @@
%asm.output = type { <16 x i32>, <8 x i32>, <5 x i32>, <4 x i32>, <16 x i32> }
; CHECK-LABEL: {{^}}illegal_eviction_assert:
-; CHECK: ; def v[4:19] v[20:27] v[0:4] v[0:3] a[0:15]
+; CHECK: ; def v[13:28] v[0:7] v[8:12] v[0:3] a[0:15]
; CHECK: ; clobber
-; CHECK: ; use v[4:19] v[20:27] v[0:4] v[0:3] a[1:16]
+; CHECK: ; use v[13:28] v[0:7] v[8:12] v[0:3] a[1:16]
define void @illegal_eviction_assert(ptr addrspace(1) %arg) #0 {
;%agpr0 = call i32 asm sideeffect "; def $0","=${a0}"()
%asm = call %asm.output asm sideeffect "; def $0 $1 $2 $3 $4","=v,=v,=v,=v,={a[0:15]}"()
diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll
index e8e122e26e192..bbb9df96f752e 100644
--- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll
@@ -59,19 +59,19 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 {
; GFX90A-LABEL: scalar_to_vector_v8i16:
; GFX90A: ; %bb.0: ; %entry
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX90A-NEXT: v_and_b32_e32 v4, 0x3ff, v0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 4, v4
+; GFX90A-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GFX90A-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX90A-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, s3
-; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v0, s0
-; GFX90A-NEXT: v_mov_b32_e32 v1, s1
+; GFX90A-NEXT: v_mov_b32_e32 v1, s3
+; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
; GFX90A-NEXT: v_mov_b32_e32 v2, s0
-; GFX90A-NEXT: v_mov_b32_e32 v3, s0
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
-; GFX90A-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX90A-NEXT: v_mov_b32_e32 v3, s1
+; GFX90A-NEXT: v_mov_b32_e32 v4, s0
+; GFX90A-NEXT: v_mov_b32_e32 v5, s0
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX90A-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; GFX90A-NEXT: s_endpgm
entry:
%val.1.i32 = extractelement <2 x i32> %in, i64 0
@@ -146,19 +146,19 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0
; GFX90A-LABEL: scalar_to_vector_v8f16:
; GFX90A: ; %bb.0: ; %entry
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX90A-NEXT: v_and_b32_e32 v4, 0x3ff, v0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 4, v4
+; GFX90A-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GFX90A-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX90A-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, s3
-; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v0, s0
-; GFX90A-NEXT: v_mov_b32_e32 v1, s1
-; GFX90A-NEXT: v_mov_b32_e32 v3, s0
+; GFX90A-NEXT: v_mov_b32_e32 v1, s3
+; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
; GFX90A-NEXT: v_mov_b32_e32 v2, s0
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
-; GFX90A-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX90A-NEXT: v_mov_b32_e32 v3, s1
+; GFX90A-NEXT: v_mov_b32_e32 v5, s0
+; GFX90A-NEXT: v_mov_b32_e32 v4, s0
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX90A-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; GFX90A-NEXT: s_endpgm
entry:
%val.1.float = extractelement <2 x float> %in, i64 0
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector-physreg-copy.ll b/llvm/test/CodeGen/AMDGPU/shufflevector-physreg-copy.ll
index 936118750ff10..4d864ad15b411 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector-physreg-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector-physreg-copy.ll
@@ -25,27 +25,27 @@ define void @shufflevector_v2i32_10_physreg_even_vgpr_pair_copy(ptr addrspace(1)
; GFX90A-LABEL: shufflevector_v2i32_10_physreg_even_vgpr_pair_copy:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v4, v5
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v5
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: shufflevector_v2i32_10_physreg_even_vgpr_pair_copy:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NEXT: v_mov_b32_e32 v0, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v4, v5
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_mov_b32_e32 v0, v5
-; GFX940-NEXT: v_mov_b32_e32 v1, v4
-; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX940-NEXT: v_mov_b32_e32 v2, v5
+; GFX940-NEXT: v_mov_b32_e32 v3, v4
+; GFX940-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1]
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%asm = call { i32, i32 } asm "; def $0, $1", "={v4},={v5}"()
@@ -214,27 +214,27 @@ define void @shufflevector_v2i32_11_physreg_even_vgpr_pair_copy(ptr addrspace(1)
; GFX90A-LABEL: shufflevector_v2i32_11_physreg_even_vgpr_pair_copy:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v4, v5
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v5
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v5
+; GFX90A-NEXT: v_mov_b32_e32 v3, v5
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: shufflevector_v2i32_11_physreg_even_vgpr_pair_copy:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NEXT: v_mov_b32_e32 v0, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v4, v5
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_mov_b32_e32 v0, v5
-; GFX940-NEXT: v_mov_b32_e32 v1, v5
-; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX940-NEXT: v_mov_b32_e32 v2, v5
+; GFX940-NEXT: v_mov_b32_e32 v3, v5
+; GFX940-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1]
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%asm = call { i32, i32 } asm "; def $0, $1", "={v4},={v5}"()
@@ -265,31 +265,31 @@ define void @shufflevector_v4i32_3210_physreg_even_vgpr_quad_copy(ptr addrspace(
; GFX90A-LABEL: shufflevector_v4i32_3210_physreg_even_vgpr_quad_copy:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v4, v5, v6, v7
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v2, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
-; GFX90A-NEXT: v_mov_b32_e32 v0, v7
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v10, v5
+; GFX90A-NEXT: v_mov_b32_e32 v9, v6
+; GFX90A-NEXT: v_mov_b32_e32 v8, v7
+; GFX90A-NEXT: v_mov_b32_e32 v11, v4
+; GFX90A-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: shufflevector_v4i32_3210_physreg_even_vgpr_quad_copy:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v8, 0
+; GFX940-NEXT: v_mov_b32_e32 v0, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v4, v5, v6, v7
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v5
-; GFX940-NEXT: v_mov_b32_e32 v1, v6
-; GFX940-NEXT: v_mov_b32_e32 v0, v7
-; GFX940-NEXT: v_mov_b32_e32 v3, v4
-; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX940-NEXT: v_mov_b32_e32 v10, v5
+; GFX940-NEXT: v_mov_b32_e32 v9, v6
+; GFX940-NEXT: v_mov_b32_e32 v8, v7
+; GFX940-NEXT: v_mov_b32_e32 v11, v4
+; GFX940-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%asm = call { i32, i32, i32, i32 } asm "; def $0, $1, $2, $3", "={v4},={v5},={v6},={v7}"()
@@ -327,31 +327,31 @@ define void @shufflevector_v4i32_1032_physreg_even_vgpr_quad_copy(ptr addrspace(
; GFX90A-LABEL: shufflevector_v4i32_1032_physreg_even_vgpr_quad_copy:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v4, v5, v6, v7
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v5
-; GFX90A-NEXT: v_mov_b32_e32 v3, v6
-; GFX90A-NEXT: v_mov_b32_e32 v2, v7
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
-; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, v5
+; GFX90A-NEXT: v_mov_b32_e32 v11, v6
+; GFX90A-NEXT: v_mov_b32_e32 v10, v7
+; GFX90A-NEXT: v_mov_b32_e32 v9, v4
+; GFX90A-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: shufflevector_v4i32_1032_physreg_even_vgpr_quad_copy:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v8, 0
+; GFX940-NEXT: v_mov_b32_e32 v0, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v4, v5, v6, v7
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_mov_b32_e32 v0, v5
-; GFX940-NEXT: v_mov_b32_e32 v3, v6
-; GFX940-NEXT: v_mov_b32_e32 v2, v7
-; GFX940-NEXT: v_mov_b32_e32 v1, v4
-; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX940-NEXT: v_mov_b32_e32 v8, v5
+; GFX940-NEXT: v_mov_b32_e32 v11, v6
+; GFX940-NEXT: v_mov_b32_e32 v10, v7
+; GFX940-NEXT: v_mov_b32_e32 v9, v4
+; GFX940-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%asm = call { i32, i32, i32, i32 } asm "; def $0, $1, $2, $3", "={v4},={v5},={v6},={v7}"()
@@ -746,16 +746,15 @@ define i32 @shufflevector_v4i32_3210_physreg_even_vgpr_quad_copy_other_use_elt(p
; GFX90A-LABEL: shufflevector_v4i32_3210_physreg_even_vgpr_quad_copy_other_use_elt:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v4, v5, v6, v7
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v2, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
-; GFX90A-NEXT: v_mov_b32_e32 v0, v7
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
-; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: v_mov_b32_e32 v10, v5
+; GFX90A-NEXT: v_mov_b32_e32 v9, v6
+; GFX90A-NEXT: v_mov_b32_e32 v8, v7
+; GFX90A-NEXT: v_mov_b32_e32 v11, v4
+; GFX90A-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17]
; GFX90A-NEXT: v_mov_b32_e32 v0, v6
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -763,17 +762,16 @@ define i32 @shufflevector_v4i32_3210_physreg_even_vgpr_quad_copy_other_use_elt(p
; GFX940-LABEL: shufflevector_v4i32_3210_physreg_even_vgpr_quad_copy_other_use_elt:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v8, 0
+; GFX940-NEXT: v_mov_b32_e32 v0, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v4, v5, v6, v7
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v5
-; GFX940-NEXT: v_mov_b32_e32 v1, v6
-; GFX940-NEXT: v_mov_b32_e32 v0, v7
-; GFX940-NEXT: v_mov_b32_e32 v3, v4
-; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
-; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_mov_b32_e32 v10, v5
+; GFX940-NEXT: v_mov_b32_e32 v9, v6
+; GFX940-NEXT: v_mov_b32_e32 v8, v7
+; GFX940-NEXT: v_mov_b32_e32 v11, v4
+; GFX940-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
; GFX940-NEXT: v_mov_b32_e32 v0, v6
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll
index 44c88f847f92e..eb0d5465cacd9 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll
@@ -127,34 +127,26 @@ define amdgpu_kernel void @max_10_vgprs_used_9a() #1 {
; GFX908: ; %bb.0:
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ;;#ASMEND
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a4
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ;;#ASMEND
-; GFX908-NEXT: v_accvgpr_read_b32 v7, a3
-; GFX908-NEXT: v_accvgpr_read_b32 v6, a2
-; GFX908-NEXT: v_accvgpr_read_b32 v5, a1
-; GFX908-NEXT: v_accvgpr_read_b32 v4, a0
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ;;#ASMEND
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ;;#ASMEND
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a1
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a2
-; GFX908-NEXT: v_accvgpr_write_b32 a1, v3
-; GFX908-NEXT: v_accvgpr_read_b32 v8, a5
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
-; GFX908-NEXT: v_accvgpr_write_b32 a8, v7
-; GFX908-NEXT: v_accvgpr_write_b32 a2, v8
-; GFX908-NEXT: v_accvgpr_write_b32 a3, v2
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a1
; GFX908-NEXT: v_accvgpr_write_b32 a4, v3
-; GFX908-NEXT: v_accvgpr_write_b32 a7, v6
-; GFX908-NEXT: v_accvgpr_write_b32 a6, v5
-; GFX908-NEXT: v_accvgpr_write_b32 a5, v4
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a2, v1
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ;;#ASMEND
-; GFX908-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX908-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a0, v4
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v5
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ;;#ASMEND
; GFX908-NEXT: s_endpgm
@@ -173,16 +165,16 @@ define amdgpu_kernel void @max_10_vgprs_used_9a() #1 {
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v9, a3
-; GFX90A-NEXT: v_accvgpr_read_b32 v8, a2
+; GFX90A-NEXT: v_accvgpr_read_b32 v5, a3
+; GFX90A-NEXT: v_accvgpr_read_b32 v4, a2
; GFX90A-NEXT: v_accvgpr_write_b32 a5, v3
; GFX90A-NEXT: v_accvgpr_write_b32 a4, v2
; GFX90A-NEXT: v_accvgpr_write_b32 a3, v1
; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v8
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v9
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v4
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v5
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
index d23e314b9465f..f6c357dc38b48 100644
--- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
+++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
@@ -70,12 +70,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: s_xor_b64 s[4:5], s[4:5], -1
; GLOBALNESS1-NEXT: s_mov_b64 s[38:39], s[8:9]
; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[8:9], 1, v1
-; GLOBALNESS1-NEXT: ; implicit-def: $vgpr59 : SGPR spill to VGPR lane
+; GLOBALNESS1-NEXT: ; implicit-def: $vgpr57 : SGPR spill to VGPR lane
; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[66:67], 1, v0
; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; GLOBALNESS1-NEXT: v_writelane_b32 v59, s8, 0
+; GLOBALNESS1-NEXT: v_writelane_b32 v57, s8, 0
; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[68:69], 1, v0
-; GLOBALNESS1-NEXT: v_writelane_b32 v59, s9, 1
+; GLOBALNESS1-NEXT: v_writelane_b32 v57, s9, 1
; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[70:71], 1, v3
; GLOBALNESS1-NEXT: v_mov_b32_e32 v46, 0x80
; GLOBALNESS1-NEXT: s_mov_b32 s82, s16
@@ -84,7 +84,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: s_mov_b64 s[34:35], s[10:11]
; GLOBALNESS1-NEXT: v_mov_b32_e32 v47, 0
; GLOBALNESS1-NEXT: s_mov_b32 s32, 0
-; GLOBALNESS1-NEXT: ; implicit-def: $vgpr56_vgpr57
+; GLOBALNESS1-NEXT: ; implicit-def: $vgpr58_vgpr59
; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0)
; GLOBALNESS1-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2
; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -93,24 +93,24 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v0
; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GLOBALNESS1-NEXT: v_writelane_b32 v59, s4, 2
+; GLOBALNESS1-NEXT: v_writelane_b32 v57, s4, 2
; GLOBALNESS1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
-; GLOBALNESS1-NEXT: v_writelane_b32 v59, s5, 3
+; GLOBALNESS1-NEXT: v_writelane_b32 v57, s5, 3
; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v3
; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GLOBALNESS1-NEXT: v_writelane_b32 v59, s4, 4
-; GLOBALNESS1-NEXT: v_writelane_b32 v59, s5, 5
+; GLOBALNESS1-NEXT: v_writelane_b32 v57, s4, 4
+; GLOBALNESS1-NEXT: v_writelane_b32 v57, s5, 5
; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v2
-; GLOBALNESS1-NEXT: v_writelane_b32 v59, s4, 6
-; GLOBALNESS1-NEXT: v_writelane_b32 v59, s5, 7
+; GLOBALNESS1-NEXT: v_writelane_b32 v57, s4, 6
+; GLOBALNESS1-NEXT: v_writelane_b32 v57, s5, 7
; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[80:81], 1, v1
-; GLOBALNESS1-NEXT: v_writelane_b32 v59, s70, 8
-; GLOBALNESS1-NEXT: v_writelane_b32 v59, s71, 9
+; GLOBALNESS1-NEXT: v_writelane_b32 v57, s70, 8
+; GLOBALNESS1-NEXT: v_writelane_b32 v57, s71, 9
; GLOBALNESS1-NEXT: s_branch .LBB1_4
; GLOBALNESS1-NEXT: .LBB1_1: ; %bb70.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT: v_readlane_b32 s6, v59, 6
-; GLOBALNESS1-NEXT: v_readlane_b32 s7, v59, 7
+; GLOBALNESS1-NEXT: v_readlane_b32 s6, v57, 6
+; GLOBALNESS1-NEXT: v_readlane_b32 s7, v57, 7
; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[6:7]
; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_28
; GLOBALNESS1-NEXT: .LBB1_2: ; %Flow15
@@ -120,7 +120,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: .LBB1_3: ; %Flow28
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[6:7]
-; GLOBALNESS1-NEXT: v_pk_mov_b32 v[56:57], v[0:1], v[0:1] op_sel:[0,1]
+; GLOBALNESS1-NEXT: v_pk_mov_b32 v[58:59], v[0:1], v[0:1] op_sel:[0,1]
; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_29
; GLOBALNESS1-NEXT: .LBB1_4: ; %bb5
; GLOBALNESS1-NEXT: ; =>This Loop Header: Depth=1
@@ -128,7 +128,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: flat_load_dword v40, v[46:47]
; GLOBALNESS1-NEXT: s_add_u32 s8, s38, 40
; GLOBALNESS1-NEXT: buffer_store_dword v42, off, s[0:3], 0
-; GLOBALNESS1-NEXT: flat_load_dword v58, v[46:47]
+; GLOBALNESS1-NEXT: flat_load_dword v56, v[46:47]
; GLOBALNESS1-NEXT: s_addc_u32 s9, s39, 0
; GLOBALNESS1-NEXT: s_getpc_b64 s[4:5]
; GLOBALNESS1-NEXT: s_add_u32 s4, s4, wobble at gotpcrel32@lo+4
@@ -186,10 +186,10 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: ; %bb.11: ; %bb33.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS1-NEXT: global_load_dwordx2 v[0:1], v[44:45], off
-; GLOBALNESS1-NEXT: v_writelane_b32 v59, s8, 10
-; GLOBALNESS1-NEXT: v_writelane_b32 v59, s9, 11
-; GLOBALNESS1-NEXT: v_readlane_b32 s4, v59, 2
-; GLOBALNESS1-NEXT: v_readlane_b32 s5, v59, 3
+; GLOBALNESS1-NEXT: v_writelane_b32 v57, s8, 10
+; GLOBALNESS1-NEXT: v_writelane_b32 v57, s9, 11
+; GLOBALNESS1-NEXT: v_readlane_b32 s4, v57, 2
+; GLOBALNESS1-NEXT: v_readlane_b32 s5, v57, 3
; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[4:5]
; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_13
; GLOBALNESS1-NEXT: ; %bb.12: ; %bb39.i
@@ -198,7 +198,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[42:43], off
; GLOBALNESS1-NEXT: .LBB1_13: ; %bb44.lr.ph.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT: v_cmp_ne_u32_e32 vcc, 0, v58
+; GLOBALNESS1-NEXT: v_cmp_ne_u32_e32 vcc, 0, v56
; GLOBALNESS1-NEXT: v_cndmask_b32_e32 v2, 0, v40, vcc
; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0)
; GLOBALNESS1-NEXT: v_cmp_nlt_f64_e32 vcc, 0, v[0:1]
@@ -228,8 +228,8 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_21
; GLOBALNESS1-NEXT: ; %bb.19: ; %bb3.i.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS1-NEXT: v_readlane_b32 s4, v59, 0
-; GLOBALNESS1-NEXT: v_readlane_b32 s5, v59, 1
+; GLOBALNESS1-NEXT: v_readlane_b32 s4, v57, 0
+; GLOBALNESS1-NEXT: v_readlane_b32 s5, v57, 1
; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[4:5]
; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_21
; GLOBALNESS1-NEXT: ; %bb.20: ; %bb6.i.i
@@ -265,7 +265,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: s_mov_b32 s13, s83
; GLOBALNESS1-NEXT: s_mov_b32 s14, s82
; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41
-; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[56:57], off
+; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[58:59], off
; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[54:55]
; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[96:97]
; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_14
@@ -277,13 +277,13 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: .LBB1_24: ; %Flow23
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS1-NEXT: s_load_dwordx4 s[4:7], s[38:39], 0x0
-; GLOBALNESS1-NEXT: v_readlane_b32 s70, v59, 8
-; GLOBALNESS1-NEXT: v_readlane_b32 s8, v59, 10
+; GLOBALNESS1-NEXT: v_readlane_b32 s70, v57, 8
+; GLOBALNESS1-NEXT: v_readlane_b32 s8, v57, 10
; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], 0, 0
-; GLOBALNESS1-NEXT: v_readlane_b32 s71, v59, 9
+; GLOBALNESS1-NEXT: v_readlane_b32 s71, v57, 9
; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0)
; GLOBALNESS1-NEXT: s_mov_b32 s55, s7
-; GLOBALNESS1-NEXT: v_readlane_b32 s9, v59, 11
+; GLOBALNESS1-NEXT: v_readlane_b32 s9, v57, 11
; GLOBALNESS1-NEXT: .LBB1_25: ; %Flow24
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS1-NEXT: s_or_b64 exec, exec, s[52:53]
@@ -291,8 +291,8 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_2
; GLOBALNESS1-NEXT: ; %bb.26: ; %bb67.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT: v_readlane_b32 s6, v59, 4
-; GLOBALNESS1-NEXT: v_readlane_b32 s7, v59, 5
+; GLOBALNESS1-NEXT: v_readlane_b32 s6, v57, 4
+; GLOBALNESS1-NEXT: v_readlane_b32 s7, v57, 5
; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[6:7]
; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_1
; GLOBALNESS1-NEXT: ; %bb.27: ; %bb69.i
@@ -384,12 +384,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: s_xor_b64 s[4:5], s[4:5], -1
; GLOBALNESS0-NEXT: s_mov_b64 s[38:39], s[8:9]
; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[8:9], 1, v1
-; GLOBALNESS0-NEXT: ; implicit-def: $vgpr59 : SGPR spill to VGPR lane
+; GLOBALNESS0-NEXT: ; implicit-def: $vgpr57 : SGPR spill to VGPR lane
; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[66:67], 1, v0
; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; GLOBALNESS0-NEXT: v_writelane_b32 v59, s8, 0
+; GLOBALNESS0-NEXT: v_writelane_b32 v57, s8, 0
; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[68:69], 1, v0
-; GLOBALNESS0-NEXT: v_writelane_b32 v59, s9, 1
+; GLOBALNESS0-NEXT: v_writelane_b32 v57, s9, 1
; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[84:85], 1, v3
; GLOBALNESS0-NEXT: v_mov_b32_e32 v46, 0x80
; GLOBALNESS0-NEXT: s_mov_b32 s70, s16
@@ -398,7 +398,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: s_mov_b64 s[34:35], s[10:11]
; GLOBALNESS0-NEXT: v_mov_b32_e32 v47, 0
; GLOBALNESS0-NEXT: s_mov_b32 s32, 0
-; GLOBALNESS0-NEXT: ; implicit-def: $vgpr56_vgpr57
+; GLOBALNESS0-NEXT: ; implicit-def: $vgpr58_vgpr59
; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0)
; GLOBALNESS0-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2
; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -407,24 +407,24 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v0
; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GLOBALNESS0-NEXT: v_writelane_b32 v59, s4, 2
+; GLOBALNESS0-NEXT: v_writelane_b32 v57, s4, 2
; GLOBALNESS0-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
-; GLOBALNESS0-NEXT: v_writelane_b32 v59, s5, 3
+; GLOBALNESS0-NEXT: v_writelane_b32 v57, s5, 3
; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v3
; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GLOBALNESS0-NEXT: v_writelane_b32 v59, s4, 4
-; GLOBALNESS0-NEXT: v_writelane_b32 v59, s5, 5
+; GLOBALNESS0-NEXT: v_writelane_b32 v57, s4, 4
+; GLOBALNESS0-NEXT: v_writelane_b32 v57, s5, 5
; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v2
-; GLOBALNESS0-NEXT: v_writelane_b32 v59, s4, 6
-; GLOBALNESS0-NEXT: v_writelane_b32 v59, s5, 7
+; GLOBALNESS0-NEXT: v_writelane_b32 v57, s4, 6
+; GLOBALNESS0-NEXT: v_writelane_b32 v57, s5, 7
; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[80:81], 1, v1
-; GLOBALNESS0-NEXT: v_writelane_b32 v59, s84, 8
-; GLOBALNESS0-NEXT: v_writelane_b32 v59, s85, 9
+; GLOBALNESS0-NEXT: v_writelane_b32 v57, s84, 8
+; GLOBALNESS0-NEXT: v_writelane_b32 v57, s85, 9
; GLOBALNESS0-NEXT: s_branch .LBB1_4
; GLOBALNESS0-NEXT: .LBB1_1: ; %bb70.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT: v_readlane_b32 s6, v59, 6
-; GLOBALNESS0-NEXT: v_readlane_b32 s7, v59, 7
+; GLOBALNESS0-NEXT: v_readlane_b32 s6, v57, 6
+; GLOBALNESS0-NEXT: v_readlane_b32 s7, v57, 7
; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[6:7]
; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_28
; GLOBALNESS0-NEXT: .LBB1_2: ; %Flow15
@@ -434,7 +434,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: .LBB1_3: ; %Flow28
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[6:7]
-; GLOBALNESS0-NEXT: v_pk_mov_b32 v[56:57], v[0:1], v[0:1] op_sel:[0,1]
+; GLOBALNESS0-NEXT: v_pk_mov_b32 v[58:59], v[0:1], v[0:1] op_sel:[0,1]
; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_29
; GLOBALNESS0-NEXT: .LBB1_4: ; %bb5
; GLOBALNESS0-NEXT: ; =>This Loop Header: Depth=1
@@ -442,7 +442,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: flat_load_dword v40, v[46:47]
; GLOBALNESS0-NEXT: s_add_u32 s8, s38, 40
; GLOBALNESS0-NEXT: buffer_store_dword v42, off, s[0:3], 0
-; GLOBALNESS0-NEXT: flat_load_dword v58, v[46:47]
+; GLOBALNESS0-NEXT: flat_load_dword v56, v[46:47]
; GLOBALNESS0-NEXT: s_addc_u32 s9, s39, 0
; GLOBALNESS0-NEXT: s_getpc_b64 s[4:5]
; GLOBALNESS0-NEXT: s_add_u32 s4, s4, wobble at gotpcrel32@lo+4
@@ -500,10 +500,10 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: ; %bb.11: ; %bb33.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS0-NEXT: global_load_dwordx2 v[0:1], v[44:45], off
-; GLOBALNESS0-NEXT: v_writelane_b32 v59, s8, 10
-; GLOBALNESS0-NEXT: v_writelane_b32 v59, s9, 11
-; GLOBALNESS0-NEXT: v_readlane_b32 s4, v59, 2
-; GLOBALNESS0-NEXT: v_readlane_b32 s5, v59, 3
+; GLOBALNESS0-NEXT: v_writelane_b32 v57, s8, 10
+; GLOBALNESS0-NEXT: v_writelane_b32 v57, s9, 11
+; GLOBALNESS0-NEXT: v_readlane_b32 s4, v57, 2
+; GLOBALNESS0-NEXT: v_readlane_b32 s5, v57, 3
; GLOBALNESS0-NEXT: s_mov_b32 s83, s55
; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[4:5]
; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_13
@@ -513,7 +513,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[42:43], off
; GLOBALNESS0-NEXT: .LBB1_13: ; %bb44.lr.ph.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT: v_cmp_ne_u32_e32 vcc, 0, v58
+; GLOBALNESS0-NEXT: v_cmp_ne_u32_e32 vcc, 0, v56
; GLOBALNESS0-NEXT: v_cndmask_b32_e32 v2, 0, v40, vcc
; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0)
; GLOBALNESS0-NEXT: v_cmp_nlt_f64_e32 vcc, 0, v[0:1]
@@ -543,8 +543,8 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_21
; GLOBALNESS0-NEXT: ; %bb.19: ; %bb3.i.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS0-NEXT: v_readlane_b32 s4, v59, 0
-; GLOBALNESS0-NEXT: v_readlane_b32 s5, v59, 1
+; GLOBALNESS0-NEXT: v_readlane_b32 s4, v57, 0
+; GLOBALNESS0-NEXT: v_readlane_b32 s5, v57, 1
; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[4:5]
; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_21
; GLOBALNESS0-NEXT: ; %bb.20: ; %bb6.i.i
@@ -580,7 +580,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: s_mov_b32 s13, s71
; GLOBALNESS0-NEXT: s_mov_b32 s14, s70
; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41
-; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[56:57], off
+; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[58:59], off
; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[54:55]
; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[96:97]
; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_14
@@ -591,12 +591,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: s_branch .LBB1_14
; GLOBALNESS0-NEXT: .LBB1_24: ; %Flow23
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT: v_readlane_b32 s84, v59, 8
-; GLOBALNESS0-NEXT: v_readlane_b32 s8, v59, 10
+; GLOBALNESS0-NEXT: v_readlane_b32 s84, v57, 8
+; GLOBALNESS0-NEXT: v_readlane_b32 s8, v57, 10
; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], 0, 0
; GLOBALNESS0-NEXT: s_mov_b32 s55, s83
-; GLOBALNESS0-NEXT: v_readlane_b32 s85, v59, 9
-; GLOBALNESS0-NEXT: v_readlane_b32 s9, v59, 11
+; GLOBALNESS0-NEXT: v_readlane_b32 s85, v57, 9
+; GLOBALNESS0-NEXT: v_readlane_b32 s9, v57, 11
; GLOBALNESS0-NEXT: .LBB1_25: ; %Flow24
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS0-NEXT: s_or_b64 exec, exec, s[52:53]
@@ -604,8 +604,8 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_2
; GLOBALNESS0-NEXT: ; %bb.26: ; %bb67.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT: v_readlane_b32 s6, v59, 4
-; GLOBALNESS0-NEXT: v_readlane_b32 s7, v59, 5
+; GLOBALNESS0-NEXT: v_readlane_b32 s6, v57, 4
+; GLOBALNESS0-NEXT: v_readlane_b32 s7, v57, 5
; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[6:7]
; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_1
; GLOBALNESS0-NEXT: ; %bb.27: ; %bb69.i
diff --git a/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll b/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll
index d0d1ba82dc000..b3166fa3f4548 100644
--- a/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll
+++ b/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll
@@ -8,9 +8,8 @@ define amdgpu_kernel void @foo(ptr addrspace(5) %ptr5, ptr %p0, double %v0, <4 x
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
-; CHECK-NEXT: v_mov_b32_e32 v40, v0
-; CHECK-NEXT: v_pk_mov_b32 v[0:1], 0, 0
-; CHECK-NEXT: flat_load_dword v42, v[0:1]
+; CHECK-NEXT: v_pk_mov_b32 v[44:45], 0, 0
+; CHECK-NEXT: flat_load_dword v42, v[44:45]
; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7]
; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5]
; CHECK-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x8
@@ -19,48 +18,44 @@ define amdgpu_kernel void @foo(ptr addrspace(5) %ptr5, ptr %p0, double %v0, <4 x
; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: s_mov_b64 s[34:35], s[8:9]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v46, s6
-; CHECK-NEXT: v_mov_b32_e32 v47, s7
+; CHECK-NEXT: v_accvgpr_write_b32 a32, s6
+; CHECK-NEXT: v_accvgpr_write_b32 a33, s7
; CHECK-NEXT: s_mov_b64 s[6:7], src_private_base
; CHECK-NEXT: s_cmp_lg_u32 s64, -1
; CHECK-NEXT: s_cselect_b32 s7, s7, 0
; CHECK-NEXT: s_cselect_b32 s8, s64, 0
; CHECK-NEXT: s_add_u32 s50, s34, 48
; CHECK-NEXT: s_addc_u32 s51, s35, 0
-; CHECK-NEXT: v_pk_mov_b32 v[58:59], s[4:5], s[4:5] op_sel:[0,1]
+; CHECK-NEXT: v_pk_mov_b32 v[56:57], s[4:5], s[4:5] op_sel:[0,1]
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, G at gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, G at gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[54:55], s[4:5], 0x0
; CHECK-NEXT: s_mov_b32 s6, 0
-; CHECK-NEXT: v_pk_mov_b32 v[0:1], 0, 0
-; CHECK-NEXT: v_mov_b32_e32 v57, s7
+; CHECK-NEXT: v_mov_b32_e32 v47, s7
; CHECK-NEXT: s_mov_b32 s7, s6
; CHECK-NEXT: s_mov_b32 s53, s14
-; CHECK-NEXT: v_accvgpr_write_b32 a33, v1
-; CHECK-NEXT: v_mov_b32_e32 v56, s8
-; CHECK-NEXT: v_pk_mov_b32 v[60:61], s[6:7], s[6:7] op_sel:[0,1]
+; CHECK-NEXT: v_mov_b32_e32 v46, s8
+; CHECK-NEXT: v_pk_mov_b32 v[58:59], s[6:7], s[6:7] op_sel:[0,1]
; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49]
; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
; CHECK-NEXT: s_mov_b64 s[8:9], s[50:51]
; CHECK-NEXT: s_mov_b32 s12, s14
; CHECK-NEXT: s_mov_b32 s13, s15
; CHECK-NEXT: s_mov_b32 s14, s16
-; CHECK-NEXT: v_mov_b32_e32 v31, v40
+; CHECK-NEXT: v_mov_b32_e32 v31, v0
; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_mov_b32 s33, s16
; CHECK-NEXT: s_mov_b32 s52, s15
; CHECK-NEXT: s_mov_b64 s[36:37], s[10:11]
-; CHECK-NEXT: v_accvgpr_write_b32 a32, v0
-; CHECK-NEXT: flat_store_dwordx2 v[58:59], v[60:61]
+; CHECK-NEXT: v_mov_b32_e32 v40, v0
+; CHECK-NEXT: flat_store_dwordx2 v[56:57], v[58:59]
; CHECK-NEXT: ; kill: def $sgpr15 killed $sgpr15
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[54:55]
-; CHECK-NEXT: flat_load_dwordx2 v[62:63], v[58:59]
-; CHECK-NEXT: v_accvgpr_read_b32 v0, a32
-; CHECK-NEXT: v_mov_b32_e32 v44, 0
-; CHECK-NEXT: v_mov_b32_e32 v45, 0x3ff00000
-; CHECK-NEXT: v_accvgpr_read_b32 v1, a33
+; CHECK-NEXT: flat_load_dwordx2 v[60:61], v[56:57]
+; CHECK-NEXT: v_mov_b32_e32 v62, 0
+; CHECK-NEXT: v_mov_b32_e32 v63, 0x3ff00000
; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49]
; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
; CHECK-NEXT: s_mov_b64 s[8:9], s[50:51]
@@ -69,20 +64,20 @@ define amdgpu_kernel void @foo(ptr addrspace(5) %ptr5, ptr %p0, double %v0, <4 x
; CHECK-NEXT: s_mov_b32 s13, s52
; CHECK-NEXT: s_mov_b32 s14, s33
; CHECK-NEXT: v_mov_b32_e32 v31, v40
-; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[44:45]
-; CHECK-NEXT: flat_store_dwordx2 v[58:59], v[60:61]
+; CHECK-NEXT: flat_store_dwordx2 v[44:45], v[62:63]
+; CHECK-NEXT: flat_store_dwordx2 v[56:57], v[58:59]
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: ; kill: def $sgpr15 killed $sgpr15
; CHECK-NEXT: s_swappc_b64 s[30:31], s[54:55]
-; CHECK-NEXT: flat_load_dwordx2 v[0:1], v[56:57] glc
+; CHECK-NEXT: flat_load_dwordx2 v[0:1], v[46:47] glc
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v0, s64
; CHECK-NEXT: v_cmp_lt_i32_e32 vcc, 0, v42
-; CHECK-NEXT: flat_store_dwordx2 v[58:59], v[62:63]
+; CHECK-NEXT: flat_store_dwordx2 v[56:57], v[60:61]
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx2 v[58:59], v[46:47]
-; CHECK-NEXT: buffer_store_dword v47, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT: buffer_store_dword v44, v0, s[0:3], 0 offen
+; CHECK-NEXT: flat_store_dwordx2 v[56:57], a[32:33]
+; CHECK-NEXT: buffer_store_dword a33, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v62, v0, s[0:3], 0 offen
; CHECK-NEXT: ; implicit-def: $vgpr4
; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc
; CHECK-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
index 2f25a93899721..fe7def8a69278 100644
--- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
@@ -1961,16 +1961,15 @@ define <6 x half> @shuffle_v6f16_452367(ptr addrspace(1) %arg0, ptr addrspace(1)
; GFX942-LABEL: shuffle_v6f16_452367:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: global_load_dwordx3 v[0:2], v[6:7], off
-; GFX942-NEXT: global_load_dword v3, v[4:5], off
+; GFX942-NEXT: global_load_dwordx3 v[4:6], v[0:1], off
+; GFX942-NEXT: global_load_dword v4, v[2:3], off
+; GFX942-NEXT: ; kill: killed $vgpr0 killed $vgpr1
+; GFX942-NEXT: ; kill: killed $vgpr2 killed $vgpr3
; GFX942-NEXT: s_waitcnt vmcnt(1)
-; GFX942-NEXT: v_mov_b32_e32 v0, v2
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v1, v5
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v6f16_452367:
@@ -5151,16 +5150,15 @@ define <6 x bfloat> @shuffle_v6bf16_452367(ptr addrspace(1) %arg0, ptr addrspace
; GFX942-LABEL: shuffle_v6bf16_452367:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: global_load_dwordx3 v[0:2], v[6:7], off
-; GFX942-NEXT: global_load_dword v3, v[4:5], off
+; GFX942-NEXT: global_load_dwordx3 v[4:6], v[0:1], off
+; GFX942-NEXT: global_load_dword v4, v[2:3], off
+; GFX942-NEXT: ; kill: killed $vgpr0 killed $vgpr1
+; GFX942-NEXT: ; kill: killed $vgpr2 killed $vgpr3
; GFX942-NEXT: s_waitcnt vmcnt(1)
-; GFX942-NEXT: v_mov_b32_e32 v0, v2
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v1, v5
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v6bf16_452367:
diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
index a401f989a2507..d8264b5a091e1 100644
--- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
+++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
@@ -58,19 +58,19 @@ define amdgpu_kernel void @v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX942-NEXT: v_and_b32_e32 v3, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v2, 2, v3
+; GFX942-NEXT: v_lshlrev_b32_e32 v1, 2, v3
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_load_dword v1, v2, s[0:1]
+; GFX942-NEXT: global_load_dword v2, v1, s[0:1]
; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v3
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX942-NEXT: s_cbranch_execz .LBB1_2
; GFX942-NEXT: ; %bb.1: ; %bb.1
-; GFX942-NEXT: global_load_dword v1, v2, s[2:3]
+; GFX942-NEXT: global_load_dword v2, v1, s[2:3]
; GFX942-NEXT: .LBB1_2: ; %bb.2
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX942-NEXT: global_store_dword v0, v2, s[6:7]
; GFX942-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -136,19 +136,19 @@ define amdgpu_kernel void @v8i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX942-NEXT: v_and_b32_e32 v4, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v4
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_lshlrev_b32_e32 v1, 3, v4
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[0:1]
+; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[0:1]
; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v4
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX942-NEXT: s_cbranch_execz .LBB3_2
; GFX942-NEXT: ; %bb.1: ; %bb.1
-; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[2:3]
+; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[2:3]
; GFX942-NEXT: .LBB3_2: ; %bb.2
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[6:7]
; GFX942-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -173,19 +173,19 @@ define amdgpu_kernel void @v16i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX942-NEXT: v_and_b32_e32 v6, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v5, 4, v6
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_lshlrev_b32_e32 v1, 4, v6
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx4 v[0:3], v5, s[0:1]
+; GFX942-NEXT: global_load_dwordx4 v[2:5], v1, s[0:1]
; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v6
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX942-NEXT: s_cbranch_execz .LBB4_2
; GFX942-NEXT: ; %bb.1: ; %bb.1
-; GFX942-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3]
+; GFX942-NEXT: global_load_dwordx4 v[2:5], v1, s[2:3]
; GFX942-NEXT: .LBB4_2: ; %bb.2
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[6:7]
; GFX942-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -210,23 +210,23 @@ define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX942-NEXT: v_and_b32_e32 v10, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v9, 5, v10
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_lshlrev_b32_e32 v1, 5, v10
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx4 v[4:7], v9, s[0:1] offset:16
-; GFX942-NEXT: global_load_dwordx4 v[0:3], v9, s[0:1]
+; GFX942-NEXT: global_load_dwordx4 v[6:9], v1, s[0:1] offset:16
+; GFX942-NEXT: global_load_dwordx4 v[2:5], v1, s[0:1]
; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v10
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX942-NEXT: s_cbranch_execz .LBB5_2
; GFX942-NEXT: ; %bb.1: ; %bb.1
-; GFX942-NEXT: global_load_dwordx4 v[4:7], v9, s[2:3] offset:16
-; GFX942-NEXT: global_load_dwordx4 v[0:3], v9, s[2:3]
+; GFX942-NEXT: global_load_dwordx4 v[6:9], v1, s[2:3] offset:16
+; GFX942-NEXT: global_load_dwordx4 v[2:5], v1, s[2:3]
; GFX942-NEXT: .LBB5_2: ; %bb.2
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(1)
-; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[6:7] offset:16
+; GFX942-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7] offset:16
; GFX942-NEXT: s_waitcnt vmcnt(1)
-; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[6:7]
+; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[6:7]
; GFX942-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -250,72 +250,72 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT: v_and_b32_e32 v2, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v1, 3, v2
+; GFX942-NEXT: v_and_b32_e32 v62, 0x3ff, v0
+; GFX942-NEXT: v_lshlrev_b32_e32 v1, 3, v62
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx4 v[28:31], v1, s[0:1] offset:240
-; GFX942-NEXT: global_load_dwordx4 v[24:27], v1, s[0:1] offset:224
-; GFX942-NEXT: global_load_dwordx4 v[20:23], v1, s[0:1] offset:208
-; GFX942-NEXT: global_load_dwordx4 v[16:19], v1, s[0:1] offset:192
-; GFX942-NEXT: global_load_dwordx4 v[12:15], v1, s[0:1] offset:176
-; GFX942-NEXT: global_load_dwordx4 v[8:11], v1, s[0:1] offset:160
-; GFX942-NEXT: global_load_dwordx4 v[4:7], v1, s[0:1] offset:144
-; GFX942-NEXT: global_load_dwordx4 a[0:3], v1, s[0:1] offset:128
-; GFX942-NEXT: global_load_dwordx4 v[60:63], v1, s[0:1] offset:112
-; GFX942-NEXT: global_load_dwordx4 v[56:59], v1, s[0:1] offset:96
-; GFX942-NEXT: global_load_dwordx4 v[52:55], v1, s[0:1] offset:80
-; GFX942-NEXT: global_load_dwordx4 v[48:51], v1, s[0:1] offset:64
-; GFX942-NEXT: global_load_dwordx4 v[44:47], v1, s[0:1] offset:48
-; GFX942-NEXT: global_load_dwordx4 v[40:43], v1, s[0:1] offset:32
-; GFX942-NEXT: global_load_dwordx4 v[36:39], v1, s[0:1] offset:16
-; GFX942-NEXT: global_load_dwordx4 v[32:35], v1, s[0:1]
-; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v2
+; GFX942-NEXT: global_load_dwordx4 v[30:33], v1, s[0:1] offset:240
+; GFX942-NEXT: global_load_dwordx4 v[26:29], v1, s[0:1] offset:224
+; GFX942-NEXT: global_load_dwordx4 v[22:25], v1, s[0:1] offset:208
+; GFX942-NEXT: global_load_dwordx4 v[18:21], v1, s[0:1] offset:192
+; GFX942-NEXT: global_load_dwordx4 v[14:17], v1, s[0:1] offset:176
+; GFX942-NEXT: global_load_dwordx4 v[10:13], v1, s[0:1] offset:160
+; GFX942-NEXT: global_load_dwordx4 v[6:9], v1, s[0:1] offset:144
+; GFX942-NEXT: global_load_dwordx4 v[2:5], v1, s[0:1] offset:128
+; GFX942-NEXT: global_load_dwordx4 a[0:3], v1, s[0:1] offset:112
+; GFX942-NEXT: global_load_dwordx4 v[58:61], v1, s[0:1] offset:96
+; GFX942-NEXT: global_load_dwordx4 v[54:57], v1, s[0:1] offset:80
+; GFX942-NEXT: global_load_dwordx4 v[50:53], v1, s[0:1] offset:64
+; GFX942-NEXT: global_load_dwordx4 v[46:49], v1, s[0:1] offset:48
+; GFX942-NEXT: global_load_dwordx4 v[42:45], v1, s[0:1] offset:32
+; GFX942-NEXT: global_load_dwordx4 v[38:41], v1, s[0:1] offset:16
+; GFX942-NEXT: global_load_dwordx4 v[34:37], v1, s[0:1]
+; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v62
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX942-NEXT: s_cbranch_execz .LBB6_2
; GFX942-NEXT: ; %bb.1: ; %bb.1
-; GFX942-NEXT: global_load_dwordx4 v[28:31], v1, s[2:3] offset:240
-; GFX942-NEXT: global_load_dwordx4 v[24:27], v1, s[2:3] offset:224
-; GFX942-NEXT: global_load_dwordx4 v[20:23], v1, s[2:3] offset:208
-; GFX942-NEXT: global_load_dwordx4 v[16:19], v1, s[2:3] offset:192
-; GFX942-NEXT: global_load_dwordx4 v[12:15], v1, s[2:3] offset:176
-; GFX942-NEXT: global_load_dwordx4 v[8:11], v1, s[2:3] offset:160
-; GFX942-NEXT: global_load_dwordx4 v[4:7], v1, s[2:3] offset:144
-; GFX942-NEXT: global_load_dwordx4 a[0:3], v1, s[2:3] offset:128
-; GFX942-NEXT: global_load_dwordx4 v[60:63], v1, s[2:3] offset:112
-; GFX942-NEXT: global_load_dwordx4 v[56:59], v1, s[2:3] offset:96
-; GFX942-NEXT: global_load_dwordx4 v[52:55], v1, s[2:3] offset:80
-; GFX942-NEXT: global_load_dwordx4 v[48:51], v1, s[2:3] offset:64
-; GFX942-NEXT: global_load_dwordx4 v[44:47], v1, s[2:3] offset:48
-; GFX942-NEXT: global_load_dwordx4 v[40:43], v1, s[2:3] offset:32
-; GFX942-NEXT: global_load_dwordx4 v[36:39], v1, s[2:3] offset:16
-; GFX942-NEXT: global_load_dwordx4 v[32:35], v1, s[2:3]
+; GFX942-NEXT: global_load_dwordx4 v[30:33], v1, s[2:3] offset:240
+; GFX942-NEXT: global_load_dwordx4 v[26:29], v1, s[2:3] offset:224
+; GFX942-NEXT: global_load_dwordx4 v[22:25], v1, s[2:3] offset:208
+; GFX942-NEXT: global_load_dwordx4 v[18:21], v1, s[2:3] offset:192
+; GFX942-NEXT: global_load_dwordx4 v[14:17], v1, s[2:3] offset:176
+; GFX942-NEXT: global_load_dwordx4 v[10:13], v1, s[2:3] offset:160
+; GFX942-NEXT: global_load_dwordx4 v[6:9], v1, s[2:3] offset:144
+; GFX942-NEXT: global_load_dwordx4 v[2:5], v1, s[2:3] offset:128
+; GFX942-NEXT: global_load_dwordx4 a[0:3], v1, s[2:3] offset:112
+; GFX942-NEXT: global_load_dwordx4 v[58:61], v1, s[2:3] offset:96
+; GFX942-NEXT: global_load_dwordx4 v[54:57], v1, s[2:3] offset:80
+; GFX942-NEXT: global_load_dwordx4 v[50:53], v1, s[2:3] offset:64
+; GFX942-NEXT: global_load_dwordx4 v[46:49], v1, s[2:3] offset:48
+; GFX942-NEXT: global_load_dwordx4 v[42:45], v1, s[2:3] offset:32
+; GFX942-NEXT: global_load_dwordx4 v[38:41], v1, s[2:3] offset:16
+; GFX942-NEXT: global_load_dwordx4 v[34:37], v1, s[2:3]
; GFX942-NEXT: .LBB6_2: ; %bb.2
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(7)
-; GFX942-NEXT: global_store_dwordx4 v0, v[60:63], s[6:7] offset:112
+; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] offset:112
; GFX942-NEXT: s_waitcnt vmcnt(7)
-; GFX942-NEXT: global_store_dwordx4 v0, v[56:59], s[6:7] offset:96
+; GFX942-NEXT: global_store_dwordx4 v0, v[58:61], s[6:7] offset:96
; GFX942-NEXT: s_waitcnt vmcnt(7)
-; GFX942-NEXT: global_store_dwordx4 v0, v[52:55], s[6:7] offset:80
+; GFX942-NEXT: global_store_dwordx4 v0, v[54:57], s[6:7] offset:80
; GFX942-NEXT: s_waitcnt vmcnt(7)
-; GFX942-NEXT: global_store_dwordx4 v0, v[48:51], s[6:7] offset:64
+; GFX942-NEXT: global_store_dwordx4 v0, v[50:53], s[6:7] offset:64
; GFX942-NEXT: s_waitcnt vmcnt(7)
-; GFX942-NEXT: global_store_dwordx4 v0, v[44:47], s[6:7] offset:48
+; GFX942-NEXT: global_store_dwordx4 v0, v[46:49], s[6:7] offset:48
; GFX942-NEXT: s_waitcnt vmcnt(7)
-; GFX942-NEXT: global_store_dwordx4 v0, v[40:43], s[6:7] offset:32
+; GFX942-NEXT: global_store_dwordx4 v0, v[42:45], s[6:7] offset:32
; GFX942-NEXT: s_waitcnt vmcnt(7)
-; GFX942-NEXT: global_store_dwordx4 v0, v[36:39], s[6:7] offset:16
+; GFX942-NEXT: global_store_dwordx4 v0, v[38:41], s[6:7] offset:16
; GFX942-NEXT: s_waitcnt vmcnt(7)
-; GFX942-NEXT: global_store_dwordx4 v0, v[32:35], s[6:7]
-; GFX942-NEXT: global_store_dwordx4 v0, v[28:31], s[6:7] offset:240
-; GFX942-NEXT: global_store_dwordx4 v0, v[24:27], s[6:7] offset:224
-; GFX942-NEXT: global_store_dwordx4 v0, v[20:23], s[6:7] offset:208
-; GFX942-NEXT: global_store_dwordx4 v0, v[16:19], s[6:7] offset:192
-; GFX942-NEXT: global_store_dwordx4 v0, v[12:15], s[6:7] offset:176
-; GFX942-NEXT: global_store_dwordx4 v0, v[8:11], s[6:7] offset:160
-; GFX942-NEXT: global_store_dwordx4 v0, v[4:7], s[6:7] offset:144
-; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] offset:128
+; GFX942-NEXT: global_store_dwordx4 v0, v[34:37], s[6:7]
+; GFX942-NEXT: global_store_dwordx4 v0, v[30:33], s[6:7] offset:240
+; GFX942-NEXT: global_store_dwordx4 v0, v[26:29], s[6:7] offset:224
+; GFX942-NEXT: global_store_dwordx4 v0, v[22:25], s[6:7] offset:208
+; GFX942-NEXT: global_store_dwordx4 v0, v[18:21], s[6:7] offset:192
+; GFX942-NEXT: global_store_dwordx4 v0, v[14:17], s[6:7] offset:176
+; GFX942-NEXT: global_store_dwordx4 v0, v[10:13], s[6:7] offset:160
+; GFX942-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7] offset:144
+; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[6:7] offset:128
; GFX942-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -391,17 +391,17 @@ define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace(
; GFX942-LABEL: v8i8_phi_chain:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX942-NEXT: v_and_b32_e32 v2, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v2
-; GFX942-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v2
-; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v2
+; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX942-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v0
+; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[8:9]
+; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[8:9]
; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX942-NEXT: s_cbranch_execz .LBB8_2
; GFX942-NEXT: ; %bb.1: ; %bb.1
-; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[10:11]
-; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v2
+; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[10:11]
+; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0
; GFX942-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
; GFX942-NEXT: s_and_b64 s[4:5], vcc, exec
; GFX942-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
@@ -410,14 +410,14 @@ define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace(
; GFX942-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
; GFX942-NEXT: s_cbranch_execz .LBB8_4
; GFX942-NEXT: ; %bb.3: ; %bb.2
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[12:13]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[12:13]
; GFX942-NEXT: .LBB8_4: ; %bb.3
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[14:15]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[14:15]
; GFX942-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -447,38 +447,38 @@ define amdgpu_kernel void @v8i8_phi_zeroinit(ptr addrspace(1) %src1, ptr addrspa
; GFX942-LABEL: v8i8_phi_zeroinit:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX942-NEXT: v_and_b32_e32 v4, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v5, 3, v4
-; GFX942-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v4
-; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v4
+; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX942-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v0
+; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx2 v[0:1], v5, s[8:9]
-; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[8:9]
+; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX942-NEXT: s_cbranch_execz .LBB9_2
; GFX942-NEXT: ; %bb.1: ; %bb.1
-; GFX942-NEXT: global_load_dwordx2 v[2:3], v5, s[10:11]
-; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v4
+; GFX942-NEXT: global_load_dwordx2 v[4:5], v1, s[10:11]
+; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0
; GFX942-NEXT: s_waitcnt vmcnt(1)
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
; GFX942-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
; GFX942-NEXT: s_and_b64 s[4:5], vcc, exec
-; GFX942-NEXT: v_mov_b32_e32 v1, v0
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
; GFX942-NEXT: .LBB9_2: ; %Flow
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
; GFX942-NEXT: s_cbranch_execz .LBB9_4
; GFX942-NEXT: ; %bb.3: ; %bb.2
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[12:13]
-; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[12:13]
; GFX942-NEXT: .LBB9_4: ; %bb.3
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[14:15]
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: global_store_dwordx2 v0, v[4:5], s[14:15]
; GFX942-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -617,30 +617,30 @@ define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspac
; GFX942-LABEL: v8i8_multi_block:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX942-NEXT: v_and_b32_e32 v5, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v6, 3, v5
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v5
+; GFX942-NEXT: v_and_b32_e32 v3, 0x3ff, v0
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v3
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v3
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx2 v[2:3], v6, s[8:9]
+; GFX942-NEXT: global_load_dwordx2 v[0:1], v4, s[8:9]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[2:3]
+; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX942-NEXT: s_cbranch_execz .LBB11_4
; GFX942-NEXT: ; %bb.1: ; %bb.1
-; GFX942-NEXT: global_load_dwordx2 v[0:1], v6, s[10:11]
-; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v5
+; GFX942-NEXT: global_load_dwordx2 v[6:7], v4, s[10:11]
+; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v3
; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX942-NEXT: s_cbranch_execz .LBB11_3
; GFX942-NEXT: ; %bb.2: ; %bb.2
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
-; GFX942-NEXT: global_store_dwordx2 v5, v[2:3], s[12:13]
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[12:13]
; GFX942-NEXT: .LBB11_3: ; %Flow
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX942-NEXT: .LBB11_4: ; %bb.3
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[14:15]
+; GFX942-NEXT: global_store_dwordx2 v2, v[6:7], s[14:15]
; GFX942-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -859,15 +859,15 @@ define amdgpu_kernel void @v8i8_mfma_i8(ptr addrspace(1) %src1, ptr addrspace(1)
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX942-NEXT: v_and_b32_e32 v4, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v4
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_lshlrev_b32_e32 v1, 3, v4
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v4
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[8:9]
+; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[8:9]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX942-NEXT: s_cbranch_execz .LBB14_2
; GFX942-NEXT: ; %bb.1: ; %bb.1
-; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[10:11]
+; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[10:11]
; GFX942-NEXT: .LBB14_2: ; %bb.2
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[14:15], 0x0
@@ -878,9 +878,9 @@ define amdgpu_kernel void @v8i8_mfma_i8(ptr addrspace(1) %src1, ptr addrspace(1)
; GFX942-NEXT: v_accvgpr_write_b32 a3, s3
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[0:1], v[0:1], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[2:3], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
; GFX942-NEXT: s_nop 6
-; GFX942-NEXT: global_store_dwordx4 v2, a[0:3], s[12:13]
+; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[12:13]
; GFX942-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -909,15 +909,15 @@ define amdgpu_kernel void @v8i8_mfma_half(ptr addrspace(1) %src1, ptr addrspace(
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx8 s[36:43], s[4:5], 0x24
; GFX942-NEXT: v_and_b32_e32 v4, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v4
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_lshlrev_b32_e32 v1, 3, v4
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v4
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[36:37]
+; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[36:37]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX942-NEXT: s_cbranch_execz .LBB15_2
; GFX942-NEXT: ; %bb.1: ; %bb.1
-; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[38:39]
+; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[38:39]
; GFX942-NEXT: .LBB15_2: ; %bb.2
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_load_dwordx16 s[16:31], s[42:43], 0x0
@@ -957,18 +957,18 @@ define amdgpu_kernel void @v8i8_mfma_half(ptr addrspace(1) %src1, ptr addrspace(
; GFX942-NEXT: v_accvgpr_write_b32 a31, s15
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mfma_f32_32x32x4_2b_f16 a[0:31], v[0:1], v[0:1], a[0:31] cbsz:1 abid:2 blgp:3
+; GFX942-NEXT: v_mfma_f32_32x32x4_2b_f16 a[0:31], v[2:3], v[2:3], a[0:31] cbsz:1 abid:2 blgp:3
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 2
-; GFX942-NEXT: global_store_dwordx4 v2, a[28:31], s[40:41] offset:112
-; GFX942-NEXT: global_store_dwordx4 v2, a[24:27], s[40:41] offset:96
-; GFX942-NEXT: global_store_dwordx4 v2, a[20:23], s[40:41] offset:80
-; GFX942-NEXT: global_store_dwordx4 v2, a[16:19], s[40:41] offset:64
-; GFX942-NEXT: global_store_dwordx4 v2, a[12:15], s[40:41] offset:48
-; GFX942-NEXT: global_store_dwordx4 v2, a[8:11], s[40:41] offset:32
-; GFX942-NEXT: global_store_dwordx4 v2, a[4:7], s[40:41] offset:16
-; GFX942-NEXT: global_store_dwordx4 v2, a[0:3], s[40:41]
+; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[40:41] offset:112
+; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[40:41] offset:96
+; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[40:41] offset:80
+; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[40:41] offset:64
+; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[40:41] offset:48
+; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[40:41] offset:32
+; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[40:41] offset:16
+; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[40:41]
; GFX942-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
More information about the llvm-commits
mailing list