[llvm] [AMDGPU] Inflate to %AV regclass (PR #147413)
Jeffrey Byrnes via llvm-commits
llvm-commits at lists.llvm.org
Wed Oct 8 16:00:07 PDT 2025
https://github.com/jrbyrnes updated https://github.com/llvm/llvm-project/pull/147413
From 3f6ecafffdce0cf8d939d86210a7f22c5b135ff7 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Tue, 15 Jul 2025 15:10:41 -0700
Subject: [PATCH 1/4] [AMDGPU] Inflate to %av regclass
Change-Id: Ied8fe81cf2c8271ca22eedbade4eb312f3fbea39
---
.../Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp | 16 +
llvm/test/CodeGen/AMDGPU/preinflate-avgpr.ll | 357 ++++++++++++++++++
2 files changed, 373 insertions(+)
create mode 100644 llvm/test/CodeGen/AMDGPU/preinflate-avgpr.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp
index 0137b3f5943d7..427922481ecca 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp
@@ -97,6 +97,8 @@ bool AMDGPUPrepareAGPRAllocImpl::run(MachineFunction &MF) {
const MCInstrDesc &AVImmPseudo32 = TII.get(AMDGPU::AV_MOV_B32_IMM_PSEUDO);
const MCInstrDesc &AVImmPseudo64 = TII.get(AMDGPU::AV_MOV_B64_IMM_PSEUDO);
+ const SIRegisterInfo *TRI =
+ static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
bool Changed = false;
for (MachineBasicBlock &MBB : MF) {
@@ -119,6 +121,20 @@ bool AMDGPUPrepareAGPRAllocImpl::run(MachineFunction &MF) {
Changed = true;
continue;
}
+
+ for (MachineOperand &Op : MI.operands()) {
+ if (!Op.isReg() || !Op.isDef())
+ continue;
+
+ Register DefReg = Op.getReg();
+ if (DefReg.isPhysical())
+ continue;
+
+ const TargetRegisterClass *RC = MRI.getRegClass(DefReg);
+
+ if (TRI->isAGPRClass(RC) || TRI->isVGPRClass(RC))
+ Changed |= MRI.recomputeRegClass(DefReg);
+ }
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/preinflate-avgpr.ll b/llvm/test/CodeGen/AMDGPU/preinflate-avgpr.ll
new file mode 100644
index 0000000000000..3a534149121fb
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/preinflate-avgpr.ll
@@ -0,0 +1,357 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 --amdgpu-mfma-vgpr-form=1 --greedy-regclass-priority-trumps-globalness=1 < %s | FileCheck %s
+
+define amdgpu_kernel void @bad_rp(ptr addrspace(3) %in0, ptr addrspace(0) %out, i1 %cond) #0 {
+; CHECK-LABEL: bad_rp:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_dword s0, s[4:5], 0x0
+; CHECK-NEXT: s_load_dword s1, s[4:5], 0x10
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-NEXT: ds_read_b128 a[0:3], v0
+; CHECK-NEXT: ds_read_b128 a[4:7], v0 offset:16
+; CHECK-NEXT: ds_read_b128 a[8:11], v0 offset:32
+; CHECK-NEXT: ds_read_b128 a[12:15], v0 offset:48
+; CHECK-NEXT: ds_read_b128 a[16:19], v0 offset:64
+; CHECK-NEXT: ds_read_b128 a[20:23], v0 offset:80
+; CHECK-NEXT: ds_read_b128 a[24:27], v0 offset:96
+; CHECK-NEXT: ds_read_b128 a[28:31], v0 offset:112
+; CHECK-NEXT: ds_read_b128 a[32:35], v0 offset:128
+; CHECK-NEXT: ds_read_b128 a[36:39], v0 offset:144
+; CHECK-NEXT: ds_read_b128 a[40:43], v0 offset:160
+; CHECK-NEXT: ds_read_b128 a[44:47], v0 offset:176
+; CHECK-NEXT: ds_read_b128 a[48:51], v0 offset:192
+; CHECK-NEXT: ds_read_b128 a[52:55], v0 offset:208
+; CHECK-NEXT: ds_read_b128 a[56:59], v0 offset:224
+; CHECK-NEXT: ds_read_b128 a[60:63], v0 offset:240
+; CHECK-NEXT: s_bitcmp1_b32 s1, 0
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: s_xor_b64 s[0:1], s[0:1], -1
+; CHECK-NEXT: .LBB0_1: ; %bb.1
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: s_waitcnt lgkmcnt(14)
+; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[240:255], a[0:3], a[0:3], 0
+; CHECK-NEXT: s_andn2_b64 vcc, exec, s[0:1]
+; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[224:239], a[4:7], a[4:7], v[240:255]
+; CHECK-NEXT: s_waitcnt lgkmcnt(13)
+; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[208:223], a[8:11], a[8:11], v[224:239]
+; CHECK-NEXT: s_waitcnt lgkmcnt(12)
+; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[192:207], a[12:15], a[12:15], v[208:223]
+; CHECK-NEXT: s_waitcnt lgkmcnt(11)
+; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[176:191], a[16:19], a[16:19], v[192:207]
+; CHECK-NEXT: s_waitcnt lgkmcnt(10)
+; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[160:175], a[20:23], a[20:23], v[176:191]
+; CHECK-NEXT: s_waitcnt lgkmcnt(9)
+; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[144:159], a[24:27], a[24:27], v[160:175]
+; CHECK-NEXT: s_waitcnt lgkmcnt(8)
+; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[128:143], a[28:31], a[28:31], v[144:159]
+; CHECK-NEXT: s_waitcnt lgkmcnt(7)
+; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[112:127], a[32:35], a[32:35], v[128:143]
+; CHECK-NEXT: s_waitcnt lgkmcnt(6)
+; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[96:111], a[36:39], a[36:39], v[112:127]
+; CHECK-NEXT: s_waitcnt lgkmcnt(5)
+; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[80:95], a[40:43], a[40:43], v[96:111]
+; CHECK-NEXT: s_waitcnt lgkmcnt(4)
+; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[64:79], a[44:47], a[44:47], v[80:95]
+; CHECK-NEXT: s_waitcnt lgkmcnt(3)
+; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[48:63], a[48:51], a[48:51], v[64:79]
+; CHECK-NEXT: s_waitcnt lgkmcnt(2)
+; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[32:47], a[52:55], a[52:55], v[48:63]
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], a[56:59], a[56:59], v[32:47]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], a[60:63], a[60:63], v[16:31]
+; CHECK-NEXT: s_cbranch_vccnz .LBB0_1
+; CHECK-NEXT: ; %bb.2: ; %bb.2
+; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b64_e32 v[168:169], s[0:1]
+; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[244:247] offset:16
+; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[240:243]
+; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[224:227] offset:32
+; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[228:231] offset:48
+; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[208:211] offset:64
+; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[212:215] offset:80
+; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[192:195] offset:96
+; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[196:199] offset:112
+; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[180:183] offset:144
+; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[176:179] offset:128
+; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[160:163] offset:160
+; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[164:167] offset:176
+; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[148:151] offset:208
+; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[156:159] offset:240
+; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[152:155] offset:224
+; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[144:147] offset:192
+; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[140:143] offset:272
+; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[136:139] offset:256
+; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[132:135] offset:240
+; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[128:131] offset:224
+; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[124:127] offset:304
+; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[120:123] offset:288
+; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[116:119] offset:272
+; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[112:115] offset:256
+; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[108:111] offset:336
+; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[104:107] offset:320
+; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[100:103] offset:304
+; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[96:99] offset:288
+; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[92:95] offset:368
+; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[88:91] offset:352
+; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[84:87] offset:336
+; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[80:83] offset:320
+; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[76:79] offset:400
+; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[72:75] offset:384
+; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[68:71] offset:368
+; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[64:67] offset:352
+; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[60:63] offset:432
+; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[56:59] offset:416
+; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[52:55] offset:400
+; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[48:51] offset:384
+; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[44:47] offset:464
+; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[40:43] offset:448
+; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[36:39] offset:432
+; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[32:35] offset:416
+; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[28:31] offset:496
+; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[24:27] offset:480
+; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[20:23] offset:464
+; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[16:19] offset:448
+; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[12:15] offset:528
+; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[8:11] offset:512
+; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[4:7] offset:496
+; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[0:3] offset:480
+; CHECK-NEXT: s_endpgm
+ %gep1 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 4
+ %gep2 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 8
+ %gep3 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 12
+ %gep4 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 16
+ %gep5 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 20
+ %gep6 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 24
+ %gep7 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 28
+ %gep8 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 32
+ %gep9 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 36
+ %gep10 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 40
+ %gep11 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 44
+ %gep12 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 48
+ %gep13 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 52
+ %gep14 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 56
+ %gep15 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 60
+ %load0 = load <8 x half>, ptr addrspace(3) %in0, align 16
+ %load1 = load <8 x half>, ptr addrspace(3) %gep1, align 16
+ %load2 = load <8 x half>, ptr addrspace(3) %gep2, align 16
+ %load3 = load <8 x half>, ptr addrspace(3) %gep3, align 16
+ %load4 = load <8 x half>, ptr addrspace(3) %gep4, align 16
+ %load5 = load <8 x half>, ptr addrspace(3) %gep5, align 16
+ %load6 = load <8 x half>, ptr addrspace(3) %gep6, align 16
+ %load7 = load <8 x half>, ptr addrspace(3) %gep7, align 16
+ %load8 = load <8 x half>, ptr addrspace(3) %gep8, align 16
+ %load9 = load <8 x half>, ptr addrspace(3) %gep9, align 16
+ %load10 = load <8 x half>, ptr addrspace(3) %gep10, align 16
+ %load11 = load <8 x half>, ptr addrspace(3) %gep11, align 16
+ %load12 = load <8 x half>, ptr addrspace(3) %gep12, align 16
+ %load13 = load <8 x half>, ptr addrspace(3) %gep13, align 16
+ %load14 = load <8 x half>, ptr addrspace(3) %gep14, align 16
+ %load15 = load <8 x half>, ptr addrspace(3) %gep15, align 16
+ br label %bb.1
+
+bb.1:
+ %mfma0 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load0, <8 x half> %load0, <16 x float> zeroinitializer, i32 0, i32 0, i32 0)
+ %mfma1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load1, <8 x half> %load1, <16 x float> %mfma0, i32 0, i32 0, i32 0)
+ %mfma2 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load2, <8 x half> %load2, <16 x float> %mfma1, i32 0, i32 0, i32 0)
+ %mfma3 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load3, <8 x half> %load3, <16 x float> %mfma2, i32 0, i32 0, i32 0)
+ %mfma4 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load4, <8 x half> %load4, <16 x float> %mfma3, i32 0, i32 0, i32 0)
+ %mfma5 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load5, <8 x half> %load5, <16 x float> %mfma4, i32 0, i32 0, i32 0)
+ %mfma6 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load6, <8 x half> %load6, <16 x float> %mfma5, i32 0, i32 0, i32 0)
+ %mfma7 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load7, <8 x half> %load7, <16 x float> %mfma6, i32 0, i32 0, i32 0)
+ %mfma8 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load8, <8 x half> %load8, <16 x float> %mfma7, i32 0, i32 0, i32 0)
+ %mfma9 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load9, <8 x half> %load9, <16 x float> %mfma8, i32 0, i32 0, i32 0)
+ %mfma10 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load10, <8 x half> %load10, <16 x float> %mfma9, i32 0, i32 0, i32 0)
+ %mfma11 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load11, <8 x half> %load11, <16 x float> %mfma10, i32 0, i32 0, i32 0)
+ %mfma12 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load12, <8 x half> %load12, <16 x float> %mfma11, i32 0, i32 0, i32 0)
+ %mfma13 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load13, <8 x half> %load13, <16 x float> %mfma12, i32 0, i32 0, i32 0)
+ %mfma14 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load14, <8 x half> %load14, <16 x float> %mfma13, i32 0, i32 0, i32 0)
+ %mfma15 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load15, <8 x half> %load15, <16 x float> %mfma14, i32 0, i32 0, i32 0)
+ br i1 %cond, label %bb.1, label %bb.2
+
+bb.2:
+ %out1 = getelementptr ptr, ptr %out, i32 4
+ %out2 = getelementptr ptr, ptr %out, i32 8
+ %out3 = getelementptr ptr, ptr %out, i32 12
+ %out4 = getelementptr ptr, ptr %out, i32 16
+ %out5 = getelementptr ptr, ptr %out, i32 20
+ %out6 = getelementptr ptr, ptr %out, i32 24
+ %out7 = getelementptr ptr, ptr %out, i32 28
+ %out8 = getelementptr ptr, ptr %out, i32 32
+ %out9 = getelementptr ptr, ptr %out, i32 36
+ %out10 = getelementptr ptr, ptr %out, i32 40
+ %out11 = getelementptr ptr, ptr %out, i32 44
+ %out12 = getelementptr ptr, ptr %out, i32 48
+ %out13 = getelementptr ptr, ptr %out, i32 52
+ %out14 = getelementptr ptr, ptr %out, i32 56
+ %out15 = getelementptr ptr, ptr %out, i32 60
+ store <16 x float> %mfma0, ptr addrspace(0) %out
+ store <16 x float> %mfma1, ptr addrspace(0) %out1
+ store <16 x float> %mfma2, ptr addrspace(0) %out2
+ store <16 x float> %mfma3, ptr addrspace(0) %out3
+ store <16 x float> %mfma4, ptr addrspace(0) %out4
+ store <16 x float> %mfma5, ptr addrspace(0) %out5
+ store <16 x float> %mfma6, ptr addrspace(0) %out6
+ store <16 x float> %mfma7, ptr addrspace(0) %out7
+ store <16 x float> %mfma8, ptr addrspace(0) %out8
+ store <16 x float> %mfma9, ptr addrspace(0) %out9
+ store <16 x float> %mfma10, ptr addrspace(0) %out10
+ store <16 x float> %mfma11, ptr addrspace(0) %out11
+ store <16 x float> %mfma12, ptr addrspace(0) %out12
+ store <16 x float> %mfma13, ptr addrspace(0) %out13
+ store <16 x float> %mfma14, ptr addrspace(0) %out14
+ store <16 x float> %mfma15, ptr addrspace(0) %out15
+ ret void
+}
+
+define amdgpu_kernel void @good_rp(ptr addrspace(3) %in0, ptr addrspace(0) %out, i1 %cond) #0 {
+; CHECK-LABEL: good_rp:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_dword s0, s[4:5], 0x10
+; CHECK-NEXT: s_load_dword s1, s[4:5], 0x0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_bitcmp1_b32 s0, 0
+; CHECK-NEXT: v_mov_b32_e32 v0, s1
+; CHECK-NEXT: ds_read_b128 v[176:179], v0
+; CHECK-NEXT: ds_read_b128 v[180:183], v0 offset:16
+; CHECK-NEXT: ds_read_b128 v[184:187], v0 offset:32
+; CHECK-NEXT: ds_read_b128 v[188:191], v0 offset:48
+; CHECK-NEXT: ds_read_b128 v[192:195], v0 offset:64
+; CHECK-NEXT: ds_read_b128 v[196:199], v0 offset:80
+; CHECK-NEXT: ds_read_b128 v[200:203], v0 offset:96
+; CHECK-NEXT: ds_read_b128 v[204:207], v0 offset:112
+; CHECK-NEXT: ds_read_b128 v[208:211], v0 offset:128
+; CHECK-NEXT: ds_read_b128 v[212:215], v0 offset:144
+; CHECK-NEXT: ds_read_b128 v[216:219], v0 offset:160
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: s_xor_b64 s[0:1], s[0:1], -1
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0
+; CHECK-NEXT: .LBB1_1: ; %bb.1
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: s_waitcnt lgkmcnt(10)
+; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[160:175], v[176:179], v[176:179], 0
+; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1]
+; CHECK-NEXT: s_waitcnt lgkmcnt(9)
+; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[144:159], v[180:183], v[180:183], v[160:175]
+; CHECK-NEXT: s_waitcnt lgkmcnt(8)
+; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[128:143], v[184:187], v[184:187], v[144:159]
+; CHECK-NEXT: s_waitcnt lgkmcnt(7)
+; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[112:127], v[188:191], v[188:191], v[128:143]
+; CHECK-NEXT: s_waitcnt lgkmcnt(6)
+; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[96:111], v[192:195], v[192:195], v[112:127]
+; CHECK-NEXT: s_waitcnt lgkmcnt(5)
+; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[80:95], v[196:199], v[196:199], v[96:111]
+; CHECK-NEXT: s_waitcnt lgkmcnt(4)
+; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[64:79], v[200:203], v[200:203], v[80:95]
+; CHECK-NEXT: s_waitcnt lgkmcnt(3)
+; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[48:63], v[204:207], v[204:207], v[64:79]
+; CHECK-NEXT: s_waitcnt lgkmcnt(2)
+; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[32:47], v[208:211], v[208:211], v[48:63]
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[212:215], v[212:215], v[32:47]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[216:219], v[216:219], v[16:31]
+; CHECK-NEXT: s_cbranch_vccnz .LBB1_1
+; CHECK-NEXT: ; %bb.2: ; %bb.2
+; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b64_e32 v[88:89], s[0:1]
+; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[164:167] offset:16
+; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[160:163]
+; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[144:147] offset:32
+; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[148:151] offset:48
+; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[128:131] offset:64
+; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[132:135] offset:80
+; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[112:115] offset:96
+; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[116:119] offset:112
+; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[100:103] offset:144
+; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[96:99] offset:128
+; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[80:83] offset:160
+; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[84:87] offset:176
+; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[68:71] offset:208
+; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[76:79] offset:240
+; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[72:75] offset:224
+; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[64:67] offset:192
+; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[60:63] offset:272
+; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[56:59] offset:256
+; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[52:55] offset:240
+; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[48:51] offset:224
+; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[44:47] offset:304
+; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[40:43] offset:288
+; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[36:39] offset:272
+; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[32:35] offset:256
+; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[28:31] offset:336
+; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[24:27] offset:320
+; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[20:23] offset:304
+; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[16:19] offset:288
+; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[12:15] offset:368
+; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[8:11] offset:352
+; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[4:7] offset:336
+; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[0:3] offset:320
+; CHECK-NEXT: s_endpgm
+ %gep1 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 4
+ %gep2 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 8
+ %gep3 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 12
+ %gep4 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 16
+ %gep5 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 20
+ %gep6 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 24
+ %gep7 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 28
+ %gep8 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 32
+ %gep9 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 36
+ %gep10 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 40
+ %load0 = load <8 x half>, ptr addrspace(3) %in0, align 16
+ %load1 = load <8 x half>, ptr addrspace(3) %gep1, align 16
+ %load2 = load <8 x half>, ptr addrspace(3) %gep2, align 16
+ %load3 = load <8 x half>, ptr addrspace(3) %gep3, align 16
+ %load4 = load <8 x half>, ptr addrspace(3) %gep4, align 16
+ %load5 = load <8 x half>, ptr addrspace(3) %gep5, align 16
+ %load6 = load <8 x half>, ptr addrspace(3) %gep6, align 16
+ %load7 = load <8 x half>, ptr addrspace(3) %gep7, align 16
+ %load8 = load <8 x half>, ptr addrspace(3) %gep8, align 16
+ %load9 = load <8 x half>, ptr addrspace(3) %gep9, align 16
+ %load10 = load <8 x half>, ptr addrspace(3) %gep10, align 16
+ br label %bb.1
+
+bb.1:
+ %mfma0 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load0, <8 x half> %load0, <16 x float> zeroinitializer, i32 0, i32 0, i32 0)
+ %mfma1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load1, <8 x half> %load1, <16 x float> %mfma0, i32 0, i32 0, i32 0)
+ %mfma2 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load2, <8 x half> %load2, <16 x float> %mfma1, i32 0, i32 0, i32 0)
+ %mfma3 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load3, <8 x half> %load3, <16 x float> %mfma2, i32 0, i32 0, i32 0)
+ %mfma4 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load4, <8 x half> %load4, <16 x float> %mfma3, i32 0, i32 0, i32 0)
+ %mfma5 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load5, <8 x half> %load5, <16 x float> %mfma4, i32 0, i32 0, i32 0)
+ %mfma6 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load6, <8 x half> %load6, <16 x float> %mfma5, i32 0, i32 0, i32 0)
+ %mfma7 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load7, <8 x half> %load7, <16 x float> %mfma6, i32 0, i32 0, i32 0)
+ %mfma8 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load8, <8 x half> %load8, <16 x float> %mfma7, i32 0, i32 0, i32 0)
+ %mfma9 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load9, <8 x half> %load9, <16 x float> %mfma8, i32 0, i32 0, i32 0)
+ %mfma10 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load10, <8 x half> %load10, <16 x float> %mfma9, i32 0, i32 0, i32 0)
+ br i1 %cond, label %bb.1, label %bb.2
+
+bb.2:
+ %out1 = getelementptr ptr, ptr %out, i32 4
+ %out2 = getelementptr ptr, ptr %out, i32 8
+ %out3 = getelementptr ptr, ptr %out, i32 12
+ %out4 = getelementptr ptr, ptr %out, i32 16
+ %out5 = getelementptr ptr, ptr %out, i32 20
+ %out6 = getelementptr ptr, ptr %out, i32 24
+ %out7 = getelementptr ptr, ptr %out, i32 28
+ %out8 = getelementptr ptr, ptr %out, i32 32
+ %out9 = getelementptr ptr, ptr %out, i32 36
+ %out10 = getelementptr ptr, ptr %out, i32 40
+ store <16 x float> %mfma0, ptr addrspace(0) %out
+ store <16 x float> %mfma1, ptr addrspace(0) %out1
+ store <16 x float> %mfma2, ptr addrspace(0) %out2
+ store <16 x float> %mfma3, ptr addrspace(0) %out3
+ store <16 x float> %mfma4, ptr addrspace(0) %out4
+ store <16 x float> %mfma5, ptr addrspace(0) %out5
+ store <16 x float> %mfma6, ptr addrspace(0) %out6
+ store <16 x float> %mfma7, ptr addrspace(0) %out7
+ store <16 x float> %mfma8, ptr addrspace(0) %out8
+ store <16 x float> %mfma9, ptr addrspace(0) %out9
+ store <16 x float> %mfma10, ptr addrspace(0) %out10
+ ret void
+}
+
+attributes #0 = { "amdgpu-flat-work-group-size"="64,64" "amdgpu-waves-per-eu"="1" }
From 6d5273761c2659ecaf8f453f8c9def032aed145e Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Thu, 25 Sep 2025 17:25:04 -0700
Subject: [PATCH 2/4] Update lit tests
Change-Id: I5a6da22ff34debbc677973453d038c86d32d0ad0
---
.../AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll | 144 +-
llvm/test/CodeGen/AMDGPU/a-v-ds-atomicrmw.ll | 406 +-
.../CodeGen/AMDGPU/a-v-flat-atomic-cmpxchg.ll | 81 +-
.../test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll | 3678 +++---
.../AMDGPU/a-v-global-atomic-cmpxchg.ll | 28 +-
.../CodeGen/AMDGPU/a-v-global-atomicrmw.ll | 1070 +-
.../AMDGPU/agpr-copy-no-free-registers.ll | 12 +-
llvm/test/CodeGen/AMDGPU/agpr-csr.ll | 680 +-
llvm/test/CodeGen/AMDGPU/agpr-remat.ll | 16 +-
.../buffer-fat-pointer-atomicrmw-fadd.ll | 42 +-
.../buffer-fat-pointer-atomicrmw-fmax.ll | 30 +-
.../buffer-fat-pointer-atomicrmw-fmin.ll | 30 +-
.../AMDGPU/buffer-fat-pointers-memcpy.ll | 307 +-
llvm/test/CodeGen/AMDGPU/ds_write2_a_v.ll | 422 +-
.../test/CodeGen/AMDGPU/flat-saddr-atomics.ll | 4 -
.../CodeGen/AMDGPU/global-i16-load-store.ll | 12 +-
.../AMDGPU/illegal-sgpr-to-vgpr-copy.ll | 7 +-
.../AMDGPU/lds-dma-workgroup-release.ll | 24 +-
.../AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll | 8 +-
.../CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll | 12 +-
.../llvm.amdgcn.image.atomic.dim.gfx90a.ll | 34 +-
.../CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll | 96 +-
.../CodeGen/AMDGPU/llvm.amdgcn.mfma.form.ll | 12 +-
.../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll | 80 +-
.../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll | 476 +-
.../AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll | 146 +-
.../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll | 1236 +-
.../CodeGen/AMDGPU/llvm.amdgcn.mfma.i8.ll | 41 +-
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll | 1496 +--
....amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll | 3688 ++++--
...m.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll | 10697 ++++++++++------
.../AMDGPU/llvm.amdgcn.smfmac.gfx950.ll | 2436 ++--
...gcn.struct.ptr.buffer.atomic.fadd_nortn.ll | 4 -
...mdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll | 16 +-
...uffer-fat-pointers-nontemporal-metadata.ll | 24 +-
llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll | 2000 ++-
llvm/test/CodeGen/AMDGPU/mfma-loop.ll | 1373 +-
.../AMDGPU/mfma-no-register-aliasing.ll | 604 +-
.../CodeGen/AMDGPU/no-fold-accvgpr-mov.ll | 51 +-
.../CodeGen/AMDGPU/no-fold-accvgpr-mov.mir | 26 +-
.../CodeGen/AMDGPU/no-fold-accvgpr-read.mir | 4 +-
.../AMDGPU/preserve-wwm-copy-dst-reg.ll | 2 +-
.../AMDGPU/rewrite-vgpr-mfma-to-agpr-phi.ll | 2 -
.../AMDGPU/rewrite-vgpr-mfma-to-agpr.ll | 119 +-
.../AMDGPU/shufflevector-physreg-copy.ll | 12 +-
.../AMDGPU/shufflevector.v2f32.v3f32.ll | 28 +-
.../AMDGPU/shufflevector.v2i32.v3i32.ll | 28 +-
.../AMDGPU/shufflevector.v2i64.v2i64.ll | 40 +-
.../CodeGen/AMDGPU/shufflevector.v2p0.v2p0.ll | 40 +-
.../CodeGen/AMDGPU/shufflevector.v2p3.v3p3.ll | 28 +-
.../AMDGPU/shufflevector.v3f32.v2f32.ll | 391 +-
.../AMDGPU/shufflevector.v3f32.v3f32.ll | 200 +-
.../AMDGPU/shufflevector.v3f32.v4f32.ll | 377 +-
.../AMDGPU/shufflevector.v3i32.v2i32.ll | 391 +-
.../AMDGPU/shufflevector.v3i32.v3i32.ll | 200 +-
.../AMDGPU/shufflevector.v3i32.v4i32.ll | 377 +-
.../AMDGPU/shufflevector.v3i64.v2i64.ll | 92 +-
.../CodeGen/AMDGPU/shufflevector.v3p0.v2p0.ll | 92 +-
.../CodeGen/AMDGPU/shufflevector.v3p3.v2p3.ll | 391 +-
.../CodeGen/AMDGPU/shufflevector.v3p3.v3p3.ll | 200 +-
.../CodeGen/AMDGPU/shufflevector.v3p3.v4p3.ll | 377 +-
.../AMDGPU/shufflevector.v4f32.v2f32.ll | 45 +-
.../AMDGPU/shufflevector.v4f32.v3f32.ll | 1474 +--
.../AMDGPU/shufflevector.v4f32.v4f32.ll | 518 +-
.../AMDGPU/shufflevector.v4i32.v2i32.ll | 45 +-
.../AMDGPU/shufflevector.v4i32.v3i32.ll | 1474 +--
.../AMDGPU/shufflevector.v4i32.v4i32.ll | 518 +-
.../AMDGPU/shufflevector.v4i64.v2i64.ll | 364 +-
.../AMDGPU/shufflevector.v4i64.v3i64.ll | 204 +-
.../AMDGPU/shufflevector.v4i64.v4i64.ll | 40 +-
.../CodeGen/AMDGPU/shufflevector.v4p0.v2p0.ll | 364 +-
.../CodeGen/AMDGPU/shufflevector.v4p0.v3p0.ll | 204 +-
.../CodeGen/AMDGPU/shufflevector.v4p0.v4p0.ll | 40 +-
.../CodeGen/AMDGPU/shufflevector.v4p3.v2p3.ll | 45 +-
.../CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll | 1474 +--
.../CodeGen/AMDGPU/shufflevector.v4p3.v4p3.ll | 518 +-
.../AMDGPU/undef-handling-crash-in-ra.ll | 23 +-
.../test/CodeGen/AMDGPU/vni8-across-blocks.ll | 22 +-
78 files changed, 23782 insertions(+), 18530 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
index 7e297f46a780e..9f1955c78eb36 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
@@ -23,9 +23,9 @@ define amdgpu_kernel void @raw_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, doub
; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s8
-; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s8
+; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 offen
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_buffer_atomic_add_noret_f64:
@@ -34,9 +34,9 @@ define amdgpu_kernel void @raw_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, doub
; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s8
-; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s8
+; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 offen
; GFX942-NEXT: s_endpgm
;
; GFX1250-LABEL: raw_buffer_atomic_add_noret_f64:
@@ -142,9 +142,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_add_noret_f64(ptr addrspace(8)
; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s8
-; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s8
+; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 offen
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_ptr_buffer_atomic_add_noret_f64:
@@ -153,9 +153,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_add_noret_f64(ptr addrspace(8)
; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s8
-; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s8
+; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 offen
; GFX942-NEXT: s_endpgm
;
; GFX1250-LABEL: raw_ptr_buffer_atomic_add_noret_f64:
@@ -261,9 +261,9 @@ define amdgpu_kernel void @struct_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, d
; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s8
-; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s8
+; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_buffer_atomic_add_noret_f64:
@@ -272,9 +272,9 @@ define amdgpu_kernel void @struct_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, d
; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s8
-; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s8
+; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen
; GFX942-NEXT: s_endpgm
;
; GFX1250-LABEL: struct_buffer_atomic_add_noret_f64:
@@ -379,9 +379,9 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_add_noret_f64(ptr addrspace(
; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s8
-; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s8
+; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_ptr_buffer_atomic_add_noret_f64:
@@ -390,9 +390,9 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_add_noret_f64(ptr addrspace(
; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s8
-; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s8
+; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen
; GFX942-NEXT: s_endpgm
;
; GFX1250-LABEL: struct_ptr_buffer_atomic_add_noret_f64:
@@ -497,9 +497,9 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, doub
; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s8
-; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s8
+; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 offen
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_buffer_atomic_min_noret_f64:
@@ -508,9 +508,9 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, doub
; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s8
-; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s8
+; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 offen
; GFX942-NEXT: s_endpgm
;
; GFX1250-LABEL: raw_buffer_atomic_min_noret_f64:
@@ -616,9 +616,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8)
; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s8
-; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s8
+; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 offen
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_ptr_buffer_atomic_min_noret_f64:
@@ -627,9 +627,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8)
; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s8
-; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s8
+; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 offen
; GFX942-NEXT: s_endpgm
;
; GFX1250-LABEL: raw_ptr_buffer_atomic_min_noret_f64:
@@ -735,9 +735,9 @@ define amdgpu_kernel void @struct_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, d
; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s8
-; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s8
+; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_buffer_atomic_min_noret_f64:
@@ -746,9 +746,9 @@ define amdgpu_kernel void @struct_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, d
; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s8
-; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s8
+; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen
; GFX942-NEXT: s_endpgm
;
; GFX1250-LABEL: struct_buffer_atomic_min_noret_f64:
@@ -853,9 +853,9 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_min_noret_f64(ptr addrspace(
; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s8
-; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s8
+; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_ptr_buffer_atomic_min_noret_f64:
@@ -864,9 +864,9 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_min_noret_f64(ptr addrspace(
; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s8
-; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s8
+; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen
; GFX942-NEXT: s_endpgm
;
; GFX1250-LABEL: struct_ptr_buffer_atomic_min_noret_f64:
@@ -971,9 +971,9 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> %rsrc, doub
; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s8
-; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s8
+; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 offen
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_buffer_atomic_max_noret_f64:
@@ -982,9 +982,9 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> %rsrc, doub
; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s8
-; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s8
+; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 offen
; GFX942-NEXT: s_endpgm
;
; GFX1250-LABEL: raw_buffer_atomic_max_noret_f64:
@@ -1090,9 +1090,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8)
; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s8
-; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s8
+; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 offen
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_ptr_buffer_atomic_max_noret_f64:
@@ -1101,9 +1101,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8)
; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s8
-; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s8
+; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 offen
; GFX942-NEXT: s_endpgm
;
; GFX1250-LABEL: raw_ptr_buffer_atomic_max_noret_f64:
@@ -1209,9 +1209,9 @@ define amdgpu_kernel void @struct_buffer_atomic_max_noret_f64(<4 x i32> %rsrc, d
; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s8
-; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s8
+; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_buffer_atomic_max_noret_f64:
@@ -1220,9 +1220,9 @@ define amdgpu_kernel void @struct_buffer_atomic_max_noret_f64(<4 x i32> %rsrc, d
; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s8
-; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s8
+; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen
; GFX942-NEXT: s_endpgm
;
; GFX1250-LABEL: struct_buffer_atomic_max_noret_f64:
@@ -1327,9 +1327,9 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_max_noret_f64(ptr addrspace(
; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s8
-; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s8
+; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_ptr_buffer_atomic_max_noret_f64:
@@ -1338,9 +1338,9 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_max_noret_f64(ptr addrspace(
; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s8
-; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s8
+; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen
; GFX942-NEXT: s_endpgm
;
; GFX1250-LABEL: struct_ptr_buffer_atomic_max_noret_f64:
diff --git a/llvm/test/CodeGen/AMDGPU/a-v-ds-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/a-v-ds-atomicrmw.ll
index 4c62409a85c00..2968e0441d349 100644
--- a/llvm/test/CodeGen/AMDGPU/a-v-ds-atomicrmw.ll
+++ b/llvm/test/CodeGen/AMDGPU/a-v-ds-atomicrmw.ll
@@ -183,122 +183,125 @@ define void @ds_atomic_xchg_i32_ret_av_av_no_agprs(ptr addrspace(3) %ptr) #0 {
; CHECK-LABEL: ds_atomic_xchg_i32_ret_av_av_no_agprs:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_accvgpr_write_b32 a2, v40 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_write_b32 a3, v41 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_write_b32 a4, v42 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_write_b32 a5, v43 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_write_b32 a6, v44 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_write_b32 a7, v45 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_write_b32 a8, v46 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_write_b32 a9, v47 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_write_b32 a10, v56 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_write_b32 a11, v57 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_write_b32 a12, v58 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_write_b32 a13, v59 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_write_b32 a14, v60 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_write_b32 a15, v61 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_write_b32 a16, v62 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_write_b32 a17, v63 ; Reload Reuse
+; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword a32, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword a33, off, s[0:3], s32 ; 4-byte Folded Spill
; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def v[0:31]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; CHECK-NEXT: v_accvgpr_write_b32 a33, v31
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def a1
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_accvgpr_write_b32 a18, v31 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a32, v30
+; CHECK-NEXT: v_accvgpr_write_b32 a31, v29
+; CHECK-NEXT: v_accvgpr_write_b32 a30, v28
+; CHECK-NEXT: v_accvgpr_write_b32 a29, v27
+; CHECK-NEXT: v_accvgpr_write_b32 a28, v26
+; CHECK-NEXT: v_accvgpr_write_b32 a27, v25
+; CHECK-NEXT: v_accvgpr_write_b32 a26, v24
+; CHECK-NEXT: v_accvgpr_write_b32 a25, v23
+; CHECK-NEXT: v_accvgpr_write_b32 a24, v22
+; CHECK-NEXT: v_accvgpr_write_b32 a23, v21
+; CHECK-NEXT: v_accvgpr_write_b32 a22, v20
+; CHECK-NEXT: v_accvgpr_write_b32 a21, v19
+; CHECK-NEXT: v_accvgpr_write_b32 a20, v18
+; CHECK-NEXT: v_accvgpr_write_b32 a19, v17
+; CHECK-NEXT: v_accvgpr_write_b32 a18, v16
+; CHECK-NEXT: v_accvgpr_write_b32 a17, v15
+; CHECK-NEXT: v_accvgpr_write_b32 a16, v14
+; CHECK-NEXT: v_accvgpr_write_b32 a15, v13
+; CHECK-NEXT: v_accvgpr_write_b32 a14, v12
+; CHECK-NEXT: v_accvgpr_write_b32 a13, v11
+; CHECK-NEXT: v_accvgpr_write_b32 a12, v10
+; CHECK-NEXT: v_accvgpr_write_b32 a11, v9
+; CHECK-NEXT: v_accvgpr_write_b32 a10, v8
+; CHECK-NEXT: v_accvgpr_write_b32 a9, v7
+; CHECK-NEXT: v_accvgpr_write_b32 a8, v6
+; CHECK-NEXT: v_accvgpr_write_b32 a7, v5
+; CHECK-NEXT: v_accvgpr_write_b32 a6, v4
+; CHECK-NEXT: v_accvgpr_write_b32 a5, v3
+; CHECK-NEXT: v_accvgpr_write_b32 a4, v2
+; CHECK-NEXT: v_accvgpr_write_b32 a3, v1
+; CHECK-NEXT: v_accvgpr_write_b32 a2, v0
; CHECK-NEXT: v_accvgpr_read_b32 v0, a0
; CHECK-NEXT: v_accvgpr_read_b32 v1, a1
; CHECK-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_accvgpr_write_b32 a31, v18 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_write_b32 a30, v19 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_write_b32 a29, v20 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_write_b32 a28, v21 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_write_b32 a27, v22 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_write_b32 a26, v23 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_write_b32 a25, v24 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_write_b32 a24, v25 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_write_b32 a23, v26 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_write_b32 a22, v27 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_write_b32 a21, v28 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_write_b32 a20, v29 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_write_b32 a19, v30 ; Reload Reuse
; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
-; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; CHECK-NEXT: v_accvgpr_read_b32 v18, a31 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_read_b32 v19, a30 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_read_b32 v20, a29 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_read_b32 v21, a28 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_read_b32 v22, a27 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_read_b32 v23, a26 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_read_b32 v24, a25 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_read_b32 v25, a24 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_read_b32 v26, a23 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_read_b32 v27, a22 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_read_b32 v28, a21 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_read_b32 v29, a20 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_read_b32 v30, a19 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v0, a2
+; CHECK-NEXT: v_accvgpr_read_b32 v1, a3
+; CHECK-NEXT: v_accvgpr_read_b32 v2, a4
+; CHECK-NEXT: v_accvgpr_read_b32 v3, a5
+; CHECK-NEXT: v_accvgpr_read_b32 v4, a6
+; CHECK-NEXT: v_accvgpr_read_b32 v5, a7
+; CHECK-NEXT: v_accvgpr_read_b32 v6, a8
+; CHECK-NEXT: v_accvgpr_read_b32 v7, a9
+; CHECK-NEXT: v_accvgpr_read_b32 v8, a10
+; CHECK-NEXT: v_accvgpr_read_b32 v9, a11
+; CHECK-NEXT: v_accvgpr_read_b32 v10, a12
+; CHECK-NEXT: v_accvgpr_read_b32 v11, a13
+; CHECK-NEXT: v_accvgpr_read_b32 v12, a14
+; CHECK-NEXT: v_accvgpr_read_b32 v13, a15
+; CHECK-NEXT: v_accvgpr_read_b32 v14, a16
+; CHECK-NEXT: v_accvgpr_read_b32 v15, a17
+; CHECK-NEXT: v_accvgpr_read_b32 v16, a18
+; CHECK-NEXT: v_accvgpr_read_b32 v17, a19
+; CHECK-NEXT: v_accvgpr_read_b32 v18, a20
+; CHECK-NEXT: v_accvgpr_read_b32 v19, a21
+; CHECK-NEXT: v_accvgpr_read_b32 v20, a22
+; CHECK-NEXT: v_accvgpr_read_b32 v21, a23
+; CHECK-NEXT: v_accvgpr_read_b32 v22, a24
+; CHECK-NEXT: v_accvgpr_read_b32 v23, a25
+; CHECK-NEXT: v_accvgpr_read_b32 v24, a26
+; CHECK-NEXT: v_accvgpr_read_b32 v25, a27
+; CHECK-NEXT: v_accvgpr_read_b32 v26, a28
+; CHECK-NEXT: v_accvgpr_read_b32 v27, a29
+; CHECK-NEXT: v_accvgpr_read_b32 v28, a30
+; CHECK-NEXT: v_accvgpr_read_b32 v29, a31
+; CHECK-NEXT: v_accvgpr_read_b32 v30, a32
+; CHECK-NEXT: v_accvgpr_read_b32 v31, a33
; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; use a0
+; CHECK-NEXT: ; use v[0:31]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_accvgpr_read_b32 v31, a18 ; Reload Reuse
+; CHECK-NEXT: buffer_load_dword a33, off, s[0:3], s32 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword a32, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; use v[0:31]
+; CHECK-NEXT: ; use a0
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_accvgpr_read_b32 v63, a17 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_read_b32 v62, a16 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_read_b32 v61, a15 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_read_b32 v60, a14 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_read_b32 v59, a13 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_read_b32 v58, a12 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_read_b32 v57, a11 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_read_b32 v56, a10 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_read_b32 v47, a9 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_read_b32 v46, a8 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_read_b32 v45, a7 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_read_b32 v44, a6 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_read_b32 v43, a5 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_read_b32 v42, a4 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_read_b32 v41, a3 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_read_b32 v40, a2 ; Reload Reuse
+; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(3) %ptr, i32 0, i32 10
%data = call i32 asm "; def $0", "=^VA"()
@@ -744,122 +747,125 @@ define void @ds_atomic_xor_i32_ret_av_av_no_agprs(ptr addrspace(3) %ptr) #0 {
; CHECK-LABEL: ds_atomic_xor_i32_ret_av_av_no_agprs:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_accvgpr_write_b32 a2, v40 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_write_b32 a3, v41 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_write_b32 a4, v42 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_write_b32 a5, v43 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_write_b32 a6, v44 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_write_b32 a7, v45 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_write_b32 a8, v46 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_write_b32 a9, v47 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_write_b32 a10, v56 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_write_b32 a11, v57 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_write_b32 a12, v58 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_write_b32 a13, v59 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_write_b32 a14, v60 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_write_b32 a15, v61 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_write_b32 a16, v62 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_write_b32 a17, v63 ; Reload Reuse
+; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword a32, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword a33, off, s[0:3], s32 ; 4-byte Folded Spill
; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def v[0:31]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; CHECK-NEXT: v_accvgpr_write_b32 a33, v31
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def a1
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_accvgpr_write_b32 a18, v31 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a32, v30
+; CHECK-NEXT: v_accvgpr_write_b32 a31, v29
+; CHECK-NEXT: v_accvgpr_write_b32 a30, v28
+; CHECK-NEXT: v_accvgpr_write_b32 a29, v27
+; CHECK-NEXT: v_accvgpr_write_b32 a28, v26
+; CHECK-NEXT: v_accvgpr_write_b32 a27, v25
+; CHECK-NEXT: v_accvgpr_write_b32 a26, v24
+; CHECK-NEXT: v_accvgpr_write_b32 a25, v23
+; CHECK-NEXT: v_accvgpr_write_b32 a24, v22
+; CHECK-NEXT: v_accvgpr_write_b32 a23, v21
+; CHECK-NEXT: v_accvgpr_write_b32 a22, v20
+; CHECK-NEXT: v_accvgpr_write_b32 a21, v19
+; CHECK-NEXT: v_accvgpr_write_b32 a20, v18
+; CHECK-NEXT: v_accvgpr_write_b32 a19, v17
+; CHECK-NEXT: v_accvgpr_write_b32 a18, v16
+; CHECK-NEXT: v_accvgpr_write_b32 a17, v15
+; CHECK-NEXT: v_accvgpr_write_b32 a16, v14
+; CHECK-NEXT: v_accvgpr_write_b32 a15, v13
+; CHECK-NEXT: v_accvgpr_write_b32 a14, v12
+; CHECK-NEXT: v_accvgpr_write_b32 a13, v11
+; CHECK-NEXT: v_accvgpr_write_b32 a12, v10
+; CHECK-NEXT: v_accvgpr_write_b32 a11, v9
+; CHECK-NEXT: v_accvgpr_write_b32 a10, v8
+; CHECK-NEXT: v_accvgpr_write_b32 a9, v7
+; CHECK-NEXT: v_accvgpr_write_b32 a8, v6
+; CHECK-NEXT: v_accvgpr_write_b32 a7, v5
+; CHECK-NEXT: v_accvgpr_write_b32 a6, v4
+; CHECK-NEXT: v_accvgpr_write_b32 a5, v3
+; CHECK-NEXT: v_accvgpr_write_b32 a4, v2
+; CHECK-NEXT: v_accvgpr_write_b32 a3, v1
+; CHECK-NEXT: v_accvgpr_write_b32 a2, v0
; CHECK-NEXT: v_accvgpr_read_b32 v0, a0
; CHECK-NEXT: v_accvgpr_read_b32 v1, a1
; CHECK-NEXT: ds_xor_rtn_b32 v0, v0, v1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_accvgpr_write_b32 a31, v18 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_write_b32 a30, v19 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_write_b32 a29, v20 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_write_b32 a28, v21 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_write_b32 a27, v22 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_write_b32 a26, v23 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_write_b32 a25, v24 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_write_b32 a24, v25 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_write_b32 a23, v26 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_write_b32 a22, v27 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_write_b32 a21, v28 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_write_b32 a20, v29 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_write_b32 a19, v30 ; Reload Reuse
; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
-; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; CHECK-NEXT: v_accvgpr_read_b32 v18, a31 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_read_b32 v19, a30 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_read_b32 v20, a29 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_read_b32 v21, a28 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_read_b32 v22, a27 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_read_b32 v23, a26 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_read_b32 v24, a25 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_read_b32 v25, a24 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_read_b32 v26, a23 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_read_b32 v27, a22 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_read_b32 v28, a21 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_read_b32 v29, a20 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_read_b32 v30, a19 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v0, a2
+; CHECK-NEXT: v_accvgpr_read_b32 v1, a3
+; CHECK-NEXT: v_accvgpr_read_b32 v2, a4
+; CHECK-NEXT: v_accvgpr_read_b32 v3, a5
+; CHECK-NEXT: v_accvgpr_read_b32 v4, a6
+; CHECK-NEXT: v_accvgpr_read_b32 v5, a7
+; CHECK-NEXT: v_accvgpr_read_b32 v6, a8
+; CHECK-NEXT: v_accvgpr_read_b32 v7, a9
+; CHECK-NEXT: v_accvgpr_read_b32 v8, a10
+; CHECK-NEXT: v_accvgpr_read_b32 v9, a11
+; CHECK-NEXT: v_accvgpr_read_b32 v10, a12
+; CHECK-NEXT: v_accvgpr_read_b32 v11, a13
+; CHECK-NEXT: v_accvgpr_read_b32 v12, a14
+; CHECK-NEXT: v_accvgpr_read_b32 v13, a15
+; CHECK-NEXT: v_accvgpr_read_b32 v14, a16
+; CHECK-NEXT: v_accvgpr_read_b32 v15, a17
+; CHECK-NEXT: v_accvgpr_read_b32 v16, a18
+; CHECK-NEXT: v_accvgpr_read_b32 v17, a19
+; CHECK-NEXT: v_accvgpr_read_b32 v18, a20
+; CHECK-NEXT: v_accvgpr_read_b32 v19, a21
+; CHECK-NEXT: v_accvgpr_read_b32 v20, a22
+; CHECK-NEXT: v_accvgpr_read_b32 v21, a23
+; CHECK-NEXT: v_accvgpr_read_b32 v22, a24
+; CHECK-NEXT: v_accvgpr_read_b32 v23, a25
+; CHECK-NEXT: v_accvgpr_read_b32 v24, a26
+; CHECK-NEXT: v_accvgpr_read_b32 v25, a27
+; CHECK-NEXT: v_accvgpr_read_b32 v26, a28
+; CHECK-NEXT: v_accvgpr_read_b32 v27, a29
+; CHECK-NEXT: v_accvgpr_read_b32 v28, a30
+; CHECK-NEXT: v_accvgpr_read_b32 v29, a31
+; CHECK-NEXT: v_accvgpr_read_b32 v30, a32
+; CHECK-NEXT: v_accvgpr_read_b32 v31, a33
; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; use a0
+; CHECK-NEXT: ; use v[0:31]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_accvgpr_read_b32 v31, a18 ; Reload Reuse
+; CHECK-NEXT: buffer_load_dword a33, off, s[0:3], s32 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword a32, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; use v[0:31]
+; CHECK-NEXT: ; use a0
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_accvgpr_read_b32 v63, a17 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_read_b32 v62, a16 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_read_b32 v61, a15 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_read_b32 v60, a14 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_read_b32 v59, a13 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_read_b32 v58, a12 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_read_b32 v57, a11 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_read_b32 v56, a10 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_read_b32 v47, a9 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_read_b32 v46, a8 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_read_b32 v45, a7 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_read_b32 v44, a6 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_read_b32 v43, a5 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_read_b32 v42, a4 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_read_b32 v41, a3 ; Reload Reuse
-; CHECK-NEXT: v_accvgpr_read_b32 v40, a2 ; Reload Reuse
+; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(3) %ptr, i32 0, i32 10
%data = call i32 asm "; def $0", "=^VA"()
diff --git a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomic-cmpxchg.ll b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomic-cmpxchg.ll
index bc341f2baa804..e9192ca2d03ac 100644
--- a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomic-cmpxchg.ll
+++ b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomic-cmpxchg.ll
@@ -472,49 +472,46 @@ define void @flat_atomic_cmpxchg_i64_ret_av_av__a(ptr %ptr) #0 {
; CHECK-LABEL: flat_atomic_cmpxchg_i64_ret_av_av__a:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0
+; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0
; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base
-; CHECK-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
+; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def v[2:3]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def v[0:1]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: ; implicit-def: $agpr0_agpr1
+; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5
; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc
; CHECK-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; CHECK-NEXT: s_cbranch_execz .LBB14_2
; CHECK-NEXT: ; %bb.1: ; %atomicrmw.global
; CHECK-NEXT: buffer_wbl2
-; CHECK-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; CHECK-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[6:7], v[0:3] glc
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: buffer_invl2
; CHECK-NEXT: buffer_wbinvl1_vol
-; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5
-; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
-; CHECK-NEXT: v_accvgpr_write_b32 a1, v1
+; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7
; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3
; CHECK-NEXT: .LBB14_2: ; %Flow
; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; CHECK-NEXT: s_cbranch_execz .LBB14_4
; CHECK-NEXT: ; %bb.3: ; %atomicrmw.private
-; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; CHECK-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc
+; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
+; CHECK-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc
; CHECK-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen
; CHECK-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4
-; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: v_accvgpr_write_b32 a0, v4
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
-; CHECK-NEXT: v_accvgpr_write_b32 a1, v5
; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
-; CHECK-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
; CHECK-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen
+; CHECK-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4
; CHECK-NEXT: .LBB14_4: ; %atomicrmw.phi
; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT: v_accvgpr_write_b32 a0, v4
+; CHECK-NEXT: v_accvgpr_write_b32 a1, v5
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use a[0:1]
; CHECK-NEXT: ;;#ASMEND
@@ -533,53 +530,50 @@ define void @flat_atomic_cmpxchg_i64_ret_a_a__a(ptr %ptr) #0 {
; CHECK-LABEL: flat_atomic_cmpxchg_i64_ret_a_a__a:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0
+; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def a[0:1]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: v_accvgpr_read_b32 v3, a1
; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base
-; CHECK-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
+; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
; CHECK-NEXT: v_accvgpr_read_b32 v2, a0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def a[0:1]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: v_accvgpr_read_b32 v0, a0
; CHECK-NEXT: v_accvgpr_read_b32 v1, a1
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
-; CHECK-NEXT: ; implicit-def: $agpr0_agpr1
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7
+; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5
; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc
; CHECK-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; CHECK-NEXT: s_cbranch_execz .LBB15_2
; CHECK-NEXT: ; %bb.1: ; %atomicrmw.global
; CHECK-NEXT: buffer_wbl2
-; CHECK-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; CHECK-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[6:7], v[0:3] glc
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: buffer_invl2
; CHECK-NEXT: buffer_wbinvl1_vol
-; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5
-; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
-; CHECK-NEXT: v_accvgpr_write_b32 a1, v1
+; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7
; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3
; CHECK-NEXT: .LBB15_2: ; %Flow
; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; CHECK-NEXT: s_cbranch_execz .LBB15_4
; CHECK-NEXT: ; %bb.3: ; %atomicrmw.private
-; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; CHECK-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc
+; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
+; CHECK-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc
; CHECK-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen
; CHECK-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4
-; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: v_accvgpr_write_b32 a0, v4
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
-; CHECK-NEXT: v_accvgpr_write_b32 a1, v5
; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
-; CHECK-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
; CHECK-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen
+; CHECK-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4
; CHECK-NEXT: .LBB15_4: ; %atomicrmw.phi
; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT: v_accvgpr_write_b32 a0, v4
+; CHECK-NEXT: v_accvgpr_write_b32 a1, v5
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use a[0:1]
; CHECK-NEXT: ;;#ASMEND
@@ -774,49 +768,46 @@ define void @flat_atomic_cmpxchg_i64_ret_v_v__a(ptr %ptr) #0 {
; CHECK-LABEL: flat_atomic_cmpxchg_i64_ret_v_v__a:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0
+; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0
; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base
-; CHECK-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
+; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def v[2:3]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def v[0:1]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: ; implicit-def: $agpr0_agpr1
+; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5
; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc
; CHECK-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; CHECK-NEXT: s_cbranch_execz .LBB19_2
; CHECK-NEXT: ; %bb.1: ; %atomicrmw.global
; CHECK-NEXT: buffer_wbl2
-; CHECK-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; CHECK-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[6:7], v[0:3] glc
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: buffer_invl2
; CHECK-NEXT: buffer_wbinvl1_vol
-; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5
-; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
-; CHECK-NEXT: v_accvgpr_write_b32 a1, v1
+; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7
; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3
; CHECK-NEXT: .LBB19_2: ; %Flow
; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; CHECK-NEXT: s_cbranch_execz .LBB19_4
; CHECK-NEXT: ; %bb.3: ; %atomicrmw.private
-; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; CHECK-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc
+; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
+; CHECK-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc
; CHECK-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen
; CHECK-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4
-; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: v_accvgpr_write_b32 a0, v4
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
-; CHECK-NEXT: v_accvgpr_write_b32 a1, v5
; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
-; CHECK-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
; CHECK-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen
+; CHECK-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4
; CHECK-NEXT: .LBB19_4: ; %atomicrmw.phi
; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT: v_accvgpr_write_b32 a0, v4
+; CHECK-NEXT: v_accvgpr_write_b32 a1, v5
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use a[0:1]
; CHECK-NEXT: ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll
index d053425afbb6d..4a8225fcd6ad2 100644
--- a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll
+++ b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll
@@ -338,225 +338,264 @@ define void @flat_atomic_xchg_i32_ret_av_av_no_agprs(ptr %ptr) #0 {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a6, v43 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a7, v44 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a8, v45 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a9, v46 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a10, v47 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a11, v56 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a12, v57 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a13, v58 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a14, v59 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a15, v60 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse
+; GFX90A-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword a32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword a33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword a34, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:31]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def a2
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_write_b32 a19, v31 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a33, v31
+; GFX90A-NEXT: v_accvgpr_write_b32 a32, v30
+; GFX90A-NEXT: v_accvgpr_write_b32 a31, v29
+; GFX90A-NEXT: v_accvgpr_write_b32 a30, v28
+; GFX90A-NEXT: v_accvgpr_write_b32 a29, v27
+; GFX90A-NEXT: v_accvgpr_write_b32 a28, v26
+; GFX90A-NEXT: v_accvgpr_write_b32 a27, v25
+; GFX90A-NEXT: v_accvgpr_write_b32 a26, v24
+; GFX90A-NEXT: v_accvgpr_write_b32 a25, v23
+; GFX90A-NEXT: v_accvgpr_write_b32 a24, v22
+; GFX90A-NEXT: v_accvgpr_write_b32 a23, v21
+; GFX90A-NEXT: v_accvgpr_write_b32 a22, v20
+; GFX90A-NEXT: v_accvgpr_write_b32 a21, v19
+; GFX90A-NEXT: v_accvgpr_write_b32 a20, v18
+; GFX90A-NEXT: v_accvgpr_write_b32 a19, v17
+; GFX90A-NEXT: v_accvgpr_write_b32 a18, v16
+; GFX90A-NEXT: v_accvgpr_write_b32 a17, v15
+; GFX90A-NEXT: v_accvgpr_write_b32 a16, v14
+; GFX90A-NEXT: v_accvgpr_write_b32 a15, v13
+; GFX90A-NEXT: v_accvgpr_write_b32 a14, v12
+; GFX90A-NEXT: v_accvgpr_write_b32 a13, v11
+; GFX90A-NEXT: v_accvgpr_write_b32 a12, v10
+; GFX90A-NEXT: v_accvgpr_write_b32 a11, v9
+; GFX90A-NEXT: v_accvgpr_write_b32 a10, v8
+; GFX90A-NEXT: v_accvgpr_write_b32 a9, v7
+; GFX90A-NEXT: v_accvgpr_write_b32 a8, v6
+; GFX90A-NEXT: v_accvgpr_write_b32 a7, v5
+; GFX90A-NEXT: v_accvgpr_write_b32 a6, v4
+; GFX90A-NEXT: v_accvgpr_write_b32 a5, v3
+; GFX90A-NEXT: v_accvgpr_write_b32 a4, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a3, v1
+; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def a34
+; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
-; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX90A-NEXT: v_accvgpr_read_b32 v2, a34
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
-; GFX90A-NEXT: v_accvgpr_write_b32 a31, v19 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a30, v20 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a29, v21 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a28, v22 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a27, v23 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a26, v24 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a25, v25 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a24, v26 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a23, v27 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; GFX90A-NEXT: v_accvgpr_read_b32 v19, a31 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v20, a30 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v21, a29 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v22, a28 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v23, a27 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v24, a26 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v25, a25 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v26, a24 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v27, a23 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v28, a22 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v29, a21 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v30, a20 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v0, a2
+; GFX90A-NEXT: v_accvgpr_read_b32 v1, a3
+; GFX90A-NEXT: v_accvgpr_read_b32 v2, a4
+; GFX90A-NEXT: v_accvgpr_read_b32 v3, a5
+; GFX90A-NEXT: v_accvgpr_read_b32 v4, a6
+; GFX90A-NEXT: v_accvgpr_read_b32 v5, a7
+; GFX90A-NEXT: v_accvgpr_read_b32 v6, a8
+; GFX90A-NEXT: v_accvgpr_read_b32 v7, a9
+; GFX90A-NEXT: v_accvgpr_read_b32 v8, a10
+; GFX90A-NEXT: v_accvgpr_read_b32 v9, a11
+; GFX90A-NEXT: v_accvgpr_read_b32 v10, a12
+; GFX90A-NEXT: v_accvgpr_read_b32 v11, a13
+; GFX90A-NEXT: v_accvgpr_read_b32 v12, a14
+; GFX90A-NEXT: v_accvgpr_read_b32 v13, a15
+; GFX90A-NEXT: v_accvgpr_read_b32 v14, a16
+; GFX90A-NEXT: v_accvgpr_read_b32 v15, a17
+; GFX90A-NEXT: v_accvgpr_read_b32 v16, a18
+; GFX90A-NEXT: v_accvgpr_read_b32 v17, a19
+; GFX90A-NEXT: v_accvgpr_read_b32 v18, a20
+; GFX90A-NEXT: v_accvgpr_read_b32 v19, a21
+; GFX90A-NEXT: v_accvgpr_read_b32 v20, a22
+; GFX90A-NEXT: v_accvgpr_read_b32 v21, a23
+; GFX90A-NEXT: v_accvgpr_read_b32 v22, a24
+; GFX90A-NEXT: v_accvgpr_read_b32 v23, a25
+; GFX90A-NEXT: v_accvgpr_read_b32 v24, a26
+; GFX90A-NEXT: v_accvgpr_read_b32 v25, a27
+; GFX90A-NEXT: v_accvgpr_read_b32 v26, a28
+; GFX90A-NEXT: v_accvgpr_read_b32 v27, a29
+; GFX90A-NEXT: v_accvgpr_read_b32 v28, a30
+; GFX90A-NEXT: v_accvgpr_read_b32 v29, a31
+; GFX90A-NEXT: v_accvgpr_read_b32 v30, a32
+; GFX90A-NEXT: v_accvgpr_read_b32 v31, a33
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use a0
+; GFX90A-NEXT: ; use v[0:31]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_accvgpr_read_b32 v31, a19 ; Reload Reuse
+; GFX90A-NEXT: buffer_load_dword a34, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword a33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword a32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v[0:31]
+; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v63, a18 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v62, a17 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v61, a16 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v60, a15 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v59, a14 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v58, a13 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v57, a12 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v56, a11 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v47, a10 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v46, a9 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v45, a8 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v44, a7 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v43, a6 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v42, a5 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v41, a4 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v40, a3 ; Reload Reuse
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xchg_i32_ret_av_av_no_agprs:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a6, v43 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a7, v44 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a8, v45 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a9, v46 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a10, v47 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a11, v56 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a12, v57 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a13, v58 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a14, v59 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a15, v60 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse
+; GFX950-NEXT: scratch_store_dword off, v40, s32 offset:72 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v41, s32 offset:68 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v42, s32 offset:64 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v43, s32 offset:60 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v44, s32 offset:56 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v45, s32 offset:52 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v46, s32 offset:48 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v47, s32 offset:44 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v56, s32 offset:40 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v57, s32 offset:36 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v58, s32 offset:32 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v59, s32 offset:28 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v60, s32 offset:24 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v61, s32 offset:20 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v62, s32 offset:16 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v63, s32 offset:12 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, a32, s32 offset:8 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, a33, s32 offset:4 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, a34, s32 ; 4-byte Folded Spill
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[0:31]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: scratch_store_dwordx4 off, v[0:3], s32 ; 16-byte Folded Spill
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: scratch_store_dwordx4 off, v[4:7], s32 offset:16 ; 16-byte Folded Spill
-; GFX950-NEXT: scratch_store_dwordx4 off, v[8:11], s32 offset:32 ; 16-byte Folded Spill
-; GFX950-NEXT: scratch_store_dwordx4 off, v[12:15], s32 offset:48 ; 16-byte Folded Spill
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def a2
+; GFX950-NEXT: ; def a34
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: v_accvgpr_write_b32 a19, v31 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_accvgpr_write_b32 a33, v31
+; GFX950-NEXT: v_accvgpr_write_b32 a32, v30
+; GFX950-NEXT: v_accvgpr_write_b32 a31, v29
+; GFX950-NEXT: v_accvgpr_write_b32 a30, v28
+; GFX950-NEXT: v_accvgpr_write_b32 a29, v27
+; GFX950-NEXT: v_accvgpr_write_b32 a28, v26
+; GFX950-NEXT: v_accvgpr_write_b32 a27, v25
+; GFX950-NEXT: v_accvgpr_write_b32 a26, v24
+; GFX950-NEXT: v_accvgpr_write_b32 a25, v23
+; GFX950-NEXT: v_accvgpr_write_b32 a24, v22
+; GFX950-NEXT: v_accvgpr_write_b32 a23, v21
+; GFX950-NEXT: v_accvgpr_write_b32 a22, v20
+; GFX950-NEXT: v_accvgpr_write_b32 a21, v19
+; GFX950-NEXT: v_accvgpr_write_b32 a20, v18
+; GFX950-NEXT: v_accvgpr_write_b32 a19, v17
+; GFX950-NEXT: v_accvgpr_write_b32 a18, v16
+; GFX950-NEXT: v_accvgpr_write_b32 a17, v15
+; GFX950-NEXT: v_accvgpr_write_b32 a16, v14
+; GFX950-NEXT: v_accvgpr_write_b32 a15, v13
+; GFX950-NEXT: v_accvgpr_write_b32 a14, v12
+; GFX950-NEXT: v_accvgpr_write_b32 a13, v11
+; GFX950-NEXT: v_accvgpr_write_b32 a12, v10
+; GFX950-NEXT: v_accvgpr_write_b32 a11, v9
+; GFX950-NEXT: v_accvgpr_write_b32 a10, v8
+; GFX950-NEXT: v_accvgpr_write_b32 a9, v7
+; GFX950-NEXT: v_accvgpr_write_b32 a8, v6
+; GFX950-NEXT: v_accvgpr_write_b32 a7, v5
+; GFX950-NEXT: v_accvgpr_write_b32 a6, v4
+; GFX950-NEXT: v_accvgpr_write_b32 a5, v3
+; GFX950-NEXT: v_accvgpr_write_b32 a4, v2
+; GFX950-NEXT: v_accvgpr_write_b32 a3, v1
+; GFX950-NEXT: v_accvgpr_write_b32 a2, v0
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
-; GFX950-NEXT: scratch_store_dwordx3 off, v[16:18], s32 offset:64 ; 12-byte Folded Spill
; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
-; GFX950-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX950-NEXT: v_accvgpr_read_b32 v2, a34
; GFX950-NEXT: buffer_wbl2 sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
-; GFX950-NEXT: v_accvgpr_write_b32 a31, v19 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a27, v23 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a28, v22 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a29, v21 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a30, v20 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a23, v27 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a24, v26 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a25, v25 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a26, v24 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: scratch_load_dwordx4 v[0:3], off, s32 ; 16-byte Folded Reload
-; GFX950-NEXT: scratch_load_dwordx4 v[4:7], off, s32 offset:16 ; 16-byte Folded Reload
-; GFX950-NEXT: scratch_load_dwordx4 v[8:11], off, s32 offset:32 ; 16-byte Folded Reload
-; GFX950-NEXT: scratch_load_dwordx4 v[12:15], off, s32 offset:48 ; 16-byte Folded Reload
-; GFX950-NEXT: v_accvgpr_read_b32 v19, a31 ; Reload Reuse
-; GFX950-NEXT: scratch_load_dwordx3 v[16:18], off, s32 offset:64 ; 12-byte Folded Reload
-; GFX950-NEXT: v_accvgpr_read_b32 v23, a27 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v22, a28 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v21, a29 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v20, a30 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v27, a23 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v26, a24 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v25, a25 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v24, a26 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v0, a2
+; GFX950-NEXT: v_accvgpr_read_b32 v1, a3
+; GFX950-NEXT: v_accvgpr_read_b32 v2, a4
+; GFX950-NEXT: v_accvgpr_read_b32 v3, a5
+; GFX950-NEXT: v_accvgpr_read_b32 v4, a6
+; GFX950-NEXT: v_accvgpr_read_b32 v5, a7
+; GFX950-NEXT: v_accvgpr_read_b32 v6, a8
+; GFX950-NEXT: v_accvgpr_read_b32 v7, a9
+; GFX950-NEXT: v_accvgpr_read_b32 v8, a10
+; GFX950-NEXT: v_accvgpr_read_b32 v9, a11
+; GFX950-NEXT: v_accvgpr_read_b32 v10, a12
+; GFX950-NEXT: v_accvgpr_read_b32 v11, a13
+; GFX950-NEXT: v_accvgpr_read_b32 v12, a14
+; GFX950-NEXT: v_accvgpr_read_b32 v13, a15
+; GFX950-NEXT: v_accvgpr_read_b32 v14, a16
+; GFX950-NEXT: v_accvgpr_read_b32 v15, a17
+; GFX950-NEXT: v_accvgpr_read_b32 v16, a18
+; GFX950-NEXT: v_accvgpr_read_b32 v17, a19
+; GFX950-NEXT: v_accvgpr_read_b32 v18, a20
+; GFX950-NEXT: v_accvgpr_read_b32 v19, a21
+; GFX950-NEXT: v_accvgpr_read_b32 v20, a22
+; GFX950-NEXT: v_accvgpr_read_b32 v21, a23
+; GFX950-NEXT: v_accvgpr_read_b32 v22, a24
+; GFX950-NEXT: v_accvgpr_read_b32 v23, a25
+; GFX950-NEXT: v_accvgpr_read_b32 v24, a26
+; GFX950-NEXT: v_accvgpr_read_b32 v25, a27
+; GFX950-NEXT: v_accvgpr_read_b32 v26, a28
+; GFX950-NEXT: v_accvgpr_read_b32 v27, a29
+; GFX950-NEXT: v_accvgpr_read_b32 v28, a30
+; GFX950-NEXT: v_accvgpr_read_b32 v29, a31
+; GFX950-NEXT: v_accvgpr_read_b32 v30, a32
+; GFX950-NEXT: v_accvgpr_read_b32 v31, a33
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use a0
+; GFX950-NEXT: ; use v[0:31]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_accvgpr_read_b32 v31, a19 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v30, a20 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v29, a21 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v28, a22 ; Reload Reuse
+; GFX950-NEXT: scratch_load_dword a34, off, s32 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword a33, off, s32 offset:4 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword a32, off, s32 offset:8 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v63, off, s32 offset:12 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v62, off, s32 offset:16 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v61, off, s32 offset:20 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v60, off, s32 offset:24 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v59, off, s32 offset:28 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v58, off, s32 offset:32 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v57, off, s32 offset:36 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v56, off, s32 offset:40 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v47, off, s32 offset:44 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v46, off, s32 offset:48 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v45, off, s32 offset:52 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v44, off, s32 offset:56 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v43, off, s32 offset:60 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v42, off, s32 offset:64 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v41, off, s32 offset:68 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v40, off, s32 offset:72 ; 4-byte Folded Reload
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v[0:31]
+; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: v_accvgpr_read_b32 v63, a18 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v62, a17 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v61, a16 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v60, a15 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v59, a14 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v58, a13 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v57, a12 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v56, a11 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v47, a10 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v46, a9 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v45, a8 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v44, a7 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v43, a6 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v42, a5 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v41, a4 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v40, a3 ; Reload Reuse
+; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
%data = call i32 asm "; def $0", "=^VA"()
@@ -640,43 +679,43 @@ define void @flat_atomic_xchg_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xchg_i64_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
-; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1
-; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
-; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
-; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1
+; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB11_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
-; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: .LBB11_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB11_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
-; GFX90A-NEXT: buffer_load_dword a0, v0, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_load_dword a1, v0, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
+; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB11_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(2)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -688,39 +727,39 @@ define void @flat_atomic_xchg_i64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
+; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def a[2:3]
+; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
-; GFX950-NEXT: v_accvgpr_read_b32 v2, a2
-; GFX950-NEXT: v_accvgpr_read_b32 v3, a3
-; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
+; GFX950-NEXT: v_accvgpr_read_b32 v5, a1
+; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
+; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB11_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: buffer_wbl2 sc0 sc1
-; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] sc0 sc1
+; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
-; GFX950-NEXT: ; implicit-def: $agpr2_agpr3
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
-; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: .LBB11_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB11_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
-; GFX950-NEXT: scratch_load_dwordx2 a[0:1], v0, off
+; GFX950-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
+; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v2, off
; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: scratch_store_dwordx2 v0, a[2:3], off
+; GFX950-NEXT: scratch_store_dwordx2 v2, a[0:1], off
; GFX950-NEXT: .LBB11_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_waitcnt vmcnt(1)
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -832,41 +871,41 @@ define void @flat_atomic_xchg_i64_ret_v_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xchg_i64_ret_v_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
-; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
+; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB13_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
-; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: .LBB13_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB13_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
-; GFX90A-NEXT: buffer_load_dword a0, v0, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_load_dword a1, v0, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
+; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB13_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(2)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -878,37 +917,37 @@ define void @flat_atomic_xchg_i64_ret_v_a(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
-; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
+; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3]
+; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v[2:3]
+; GFX950-NEXT: ; def v[4:5]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB13_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: buffer_wbl2 sc0 sc1
-; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] sc0 sc1
+; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
-; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: .LBB13_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB13_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
-; GFX950-NEXT: scratch_load_dwordx2 a[0:1], v0, off
+; GFX950-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
+; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v2, off
; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: scratch_store_dwordx2 v0, v[2:3], off
+; GFX950-NEXT: scratch_store_dwordx2 v2, v[4:5], off
; GFX950-NEXT: .LBB13_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_waitcnt vmcnt(1)
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -926,41 +965,40 @@ define void @flat_atomic_xchg_i64_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xchg_i64_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
-; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
-; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
+; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:5]
+; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB14_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] glc
+; GFX90A-NEXT: flat_atomic_swap_x2 v[2:3], v[4:5], v[0:1] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
-; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: .LBB14_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB14_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
-; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
-; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB14_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(2)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v[0:1]
+; GFX90A-NEXT: ; use v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -1016,41 +1054,40 @@ define void @flat_atomic_xchg_i64_ret_av_v(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xchg_i64_ret_av_v:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
-; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
-; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
+; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:5]
+; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB15_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] glc
+; GFX90A-NEXT: flat_atomic_swap_x2 v[2:3], v[4:5], v[0:1] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
-; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: .LBB15_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB15_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
-; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
-; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB15_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(2)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v[0:1]
+; GFX90A-NEXT: ; use v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -1106,41 +1143,40 @@ define void @flat_atomic_xchg_i64_ret_av_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xchg_i64_ret_av_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
-; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
+; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB16_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX90A-NEXT: flat_atomic_swap_x2 v[2:3], v[4:5], v[0:1] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
-; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
-; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: .LBB16_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB16_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
-; GFX90A-NEXT: buffer_load_dword a0, v0, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_load_dword a1, v0, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB16_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(2)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -1152,37 +1188,37 @@ define void @flat_atomic_xchg_i64_ret_av_a(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
-; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
+; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3]
+; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v[2:3]
+; GFX950-NEXT: ; def v[4:5]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB16_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: buffer_wbl2 sc0 sc1
-; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] sc0 sc1
+; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
-; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: .LBB16_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB16_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
-; GFX950-NEXT: scratch_load_dwordx2 a[0:1], v0, off
+; GFX950-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
+; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v2, off
; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: scratch_store_dwordx2 v0, v[2:3], off
+; GFX950-NEXT: scratch_store_dwordx2 v2, v[4:5], off
; GFX950-NEXT: .LBB16_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_waitcnt vmcnt(1)
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -1384,12 +1420,10 @@ define void @flat_atomic_xchg_i64_noret_a(ptr %ptr) #0 {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1
-; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
-; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB19_3
@@ -1406,14 +1440,13 @@ define void @flat_atomic_xchg_i64_noret_a(ptr %ptr) #0 {
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB19_2
; GFX90A-NEXT: .LBB19_4: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
-; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword a1, v0, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_store_dword a0, v0, s[0:3], 0 offen
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -1483,7 +1516,6 @@ define void @flat_atomic_xchg_i64_noret_av(ptr %ptr) #0 {
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB20_2
; GFX90A-NEXT: .LBB20_4: ; %atomicrmw.private
@@ -1560,12 +1592,12 @@ define void @flat_atomic_xor_expansion_i32_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB21_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -1590,12 +1622,12 @@ define void @flat_atomic_xor_expansion_i32_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB21_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -1695,12 +1727,12 @@ define void @flat_atomic_xor_expansion_i32_ret_v_a(ptr %ptr) #0 {
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB23_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -1724,12 +1756,12 @@ define void @flat_atomic_xor_expansion_i32_ret_v_a(ptr %ptr) #0 {
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB23_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -1892,12 +1924,12 @@ define void @flat_atomic_xor_expansion_i32_ret_av_a(ptr %ptr) #0 {
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB26_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -1921,12 +1953,12 @@ define void @flat_atomic_xor_expansion_i32_ret_av_a(ptr %ptr) #0 {
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB26_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -2494,7 +2526,7 @@ define void @flat_atomic_xor_expansion_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: v_accvgpr_read_b32 v7, a1
; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
-; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB32_4
@@ -2512,9 +2544,7 @@ define void @flat_atomic_xor_expansion_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB32_2
@@ -2528,18 +2558,18 @@ define void @flat_atomic_xor_expansion_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
-; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(1)
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX90A-NEXT: v_xor_b32_e32 v1, v3, v7
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_xor_b32_e32 v4, v2, v6
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT: v_xor_b32_e32 v3, v1, v7
; GFX90A-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB32_6: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -2556,7 +2586,7 @@ define void @flat_atomic_xor_expansion_i64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
; GFX950-NEXT: v_accvgpr_read_b32 v7, a1
; GFX950-NEXT: v_accvgpr_read_b32 v6, a0
-; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB32_4
@@ -2573,9 +2603,7 @@ define void @flat_atomic_xor_expansion_i64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB32_2
@@ -2590,15 +2618,15 @@ define void @flat_atomic_xor_expansion_i64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
-; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
+; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_xor_b32_e32 v3, v1, v7
-; GFX950-NEXT: v_xor_b32_e32 v2, v0, v6
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
-; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
+; GFX950-NEXT: v_xor_b32_e32 v1, v3, v7
+; GFX950-NEXT: v_xor_b32_e32 v0, v2, v6
+; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off
; GFX950-NEXT: .LBB32_6: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -2742,7 +2770,7 @@ define void @flat_atomic_xor_expansion_i64_ret_v_a(ptr %ptr) #0 {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[6:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB34_4
@@ -2760,9 +2788,7 @@ define void @flat_atomic_xor_expansion_i64_ret_v_a(ptr %ptr) #0 {
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB34_2
@@ -2776,18 +2802,18 @@ define void @flat_atomic_xor_expansion_i64_ret_v_a(ptr %ptr) #0 {
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
-; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(1)
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX90A-NEXT: v_xor_b32_e32 v1, v3, v7
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_xor_b32_e32 v4, v2, v6
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT: v_xor_b32_e32 v3, v1, v7
; GFX90A-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB34_6: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -2802,7 +2828,7 @@ define void @flat_atomic_xor_expansion_i64_ret_v_a(ptr %ptr) #0 {
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[6:7]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB34_4
@@ -2819,9 +2845,7 @@ define void @flat_atomic_xor_expansion_i64_ret_v_a(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB34_2
@@ -2836,15 +2860,15 @@ define void @flat_atomic_xor_expansion_i64_ret_v_a(ptr %ptr) #0 {
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
-; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
+; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_xor_b32_e32 v3, v1, v7
-; GFX950-NEXT: v_xor_b32_e32 v2, v0, v6
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
-; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
+; GFX950-NEXT: v_xor_b32_e32 v1, v3, v7
+; GFX950-NEXT: v_xor_b32_e32 v0, v2, v6
+; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off
; GFX950-NEXT: .LBB34_6: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -3101,7 +3125,7 @@ define void @flat_atomic_xor_expansion_i64_ret_av_a(ptr %ptr) #0 {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[6:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB37_4
@@ -3119,9 +3143,7 @@ define void @flat_atomic_xor_expansion_i64_ret_av_a(ptr %ptr) #0 {
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB37_2
@@ -3135,18 +3157,18 @@ define void @flat_atomic_xor_expansion_i64_ret_av_a(ptr %ptr) #0 {
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
-; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(1)
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX90A-NEXT: v_xor_b32_e32 v1, v3, v7
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_xor_b32_e32 v4, v2, v6
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT: v_xor_b32_e32 v3, v1, v7
; GFX90A-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB37_6: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -3161,7 +3183,7 @@ define void @flat_atomic_xor_expansion_i64_ret_av_a(ptr %ptr) #0 {
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[6:7]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB37_4
@@ -3178,9 +3200,7 @@ define void @flat_atomic_xor_expansion_i64_ret_av_a(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB37_2
@@ -3195,15 +3215,15 @@ define void @flat_atomic_xor_expansion_i64_ret_av_a(ptr %ptr) #0 {
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
-; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
+; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_xor_b32_e32 v3, v1, v7
-; GFX950-NEXT: v_xor_b32_e32 v2, v0, v6
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
-; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
+; GFX950-NEXT: v_xor_b32_e32 v1, v3, v7
+; GFX950-NEXT: v_xor_b32_e32 v0, v2, v6
+; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off
; GFX950-NEXT: .LBB37_6: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -4008,223 +4028,262 @@ define void @flat_atomic_xor_i32_ret_av_av_no_agprs(ptr %ptr) #0 {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a6, v43 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a7, v44 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a8, v45 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a9, v46 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a10, v47 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a11, v56 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a12, v57 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a13, v58 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a14, v59 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a15, v60 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse
+; GFX90A-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword a32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword a33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword a34, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:31]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def a2
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_write_b32 a19, v31 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a33, v31
+; GFX90A-NEXT: v_accvgpr_write_b32 a32, v30
+; GFX90A-NEXT: v_accvgpr_write_b32 a31, v29
+; GFX90A-NEXT: v_accvgpr_write_b32 a30, v28
+; GFX90A-NEXT: v_accvgpr_write_b32 a29, v27
+; GFX90A-NEXT: v_accvgpr_write_b32 a28, v26
+; GFX90A-NEXT: v_accvgpr_write_b32 a27, v25
+; GFX90A-NEXT: v_accvgpr_write_b32 a26, v24
+; GFX90A-NEXT: v_accvgpr_write_b32 a25, v23
+; GFX90A-NEXT: v_accvgpr_write_b32 a24, v22
+; GFX90A-NEXT: v_accvgpr_write_b32 a23, v21
+; GFX90A-NEXT: v_accvgpr_write_b32 a22, v20
+; GFX90A-NEXT: v_accvgpr_write_b32 a21, v19
+; GFX90A-NEXT: v_accvgpr_write_b32 a20, v18
+; GFX90A-NEXT: v_accvgpr_write_b32 a19, v17
+; GFX90A-NEXT: v_accvgpr_write_b32 a18, v16
+; GFX90A-NEXT: v_accvgpr_write_b32 a17, v15
+; GFX90A-NEXT: v_accvgpr_write_b32 a16, v14
+; GFX90A-NEXT: v_accvgpr_write_b32 a15, v13
+; GFX90A-NEXT: v_accvgpr_write_b32 a14, v12
+; GFX90A-NEXT: v_accvgpr_write_b32 a13, v11
+; GFX90A-NEXT: v_accvgpr_write_b32 a12, v10
+; GFX90A-NEXT: v_accvgpr_write_b32 a11, v9
+; GFX90A-NEXT: v_accvgpr_write_b32 a10, v8
+; GFX90A-NEXT: v_accvgpr_write_b32 a9, v7
+; GFX90A-NEXT: v_accvgpr_write_b32 a8, v6
+; GFX90A-NEXT: v_accvgpr_write_b32 a7, v5
+; GFX90A-NEXT: v_accvgpr_write_b32 a6, v4
+; GFX90A-NEXT: v_accvgpr_write_b32 a5, v3
+; GFX90A-NEXT: v_accvgpr_write_b32 a4, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a3, v1
+; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def a34
+; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
-; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX90A-NEXT: v_accvgpr_read_b32 v2, a34
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: flat_atomic_xor v0, v[0:1], v2 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
-; GFX90A-NEXT: v_accvgpr_write_b32 a31, v19 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a30, v20 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a29, v21 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a28, v22 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a27, v23 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a26, v24 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a25, v25 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a24, v26 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a23, v27 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; GFX90A-NEXT: v_accvgpr_read_b32 v19, a31 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v20, a30 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v21, a29 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v22, a28 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v23, a27 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v24, a26 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v25, a25 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v26, a24 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v27, a23 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v28, a22 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v29, a21 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v30, a20 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v0, a2
+; GFX90A-NEXT: v_accvgpr_read_b32 v1, a3
+; GFX90A-NEXT: v_accvgpr_read_b32 v2, a4
+; GFX90A-NEXT: v_accvgpr_read_b32 v3, a5
+; GFX90A-NEXT: v_accvgpr_read_b32 v4, a6
+; GFX90A-NEXT: v_accvgpr_read_b32 v5, a7
+; GFX90A-NEXT: v_accvgpr_read_b32 v6, a8
+; GFX90A-NEXT: v_accvgpr_read_b32 v7, a9
+; GFX90A-NEXT: v_accvgpr_read_b32 v8, a10
+; GFX90A-NEXT: v_accvgpr_read_b32 v9, a11
+; GFX90A-NEXT: v_accvgpr_read_b32 v10, a12
+; GFX90A-NEXT: v_accvgpr_read_b32 v11, a13
+; GFX90A-NEXT: v_accvgpr_read_b32 v12, a14
+; GFX90A-NEXT: v_accvgpr_read_b32 v13, a15
+; GFX90A-NEXT: v_accvgpr_read_b32 v14, a16
+; GFX90A-NEXT: v_accvgpr_read_b32 v15, a17
+; GFX90A-NEXT: v_accvgpr_read_b32 v16, a18
+; GFX90A-NEXT: v_accvgpr_read_b32 v17, a19
+; GFX90A-NEXT: v_accvgpr_read_b32 v18, a20
+; GFX90A-NEXT: v_accvgpr_read_b32 v19, a21
+; GFX90A-NEXT: v_accvgpr_read_b32 v20, a22
+; GFX90A-NEXT: v_accvgpr_read_b32 v21, a23
+; GFX90A-NEXT: v_accvgpr_read_b32 v22, a24
+; GFX90A-NEXT: v_accvgpr_read_b32 v23, a25
+; GFX90A-NEXT: v_accvgpr_read_b32 v24, a26
+; GFX90A-NEXT: v_accvgpr_read_b32 v25, a27
+; GFX90A-NEXT: v_accvgpr_read_b32 v26, a28
+; GFX90A-NEXT: v_accvgpr_read_b32 v27, a29
+; GFX90A-NEXT: v_accvgpr_read_b32 v28, a30
+; GFX90A-NEXT: v_accvgpr_read_b32 v29, a31
+; GFX90A-NEXT: v_accvgpr_read_b32 v30, a32
+; GFX90A-NEXT: v_accvgpr_read_b32 v31, a33
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use a0
+; GFX90A-NEXT: ; use v[0:31]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_accvgpr_read_b32 v31, a19 ; Reload Reuse
+; GFX90A-NEXT: buffer_load_dword a34, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword a33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword a32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v[0:31]
+; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v63, a18 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v62, a17 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v61, a16 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v60, a15 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v59, a14 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v58, a13 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v57, a12 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v56, a11 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v47, a10 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v46, a9 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v45, a8 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v44, a7 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v43, a6 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v42, a5 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v41, a4 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v40, a3 ; Reload Reuse
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xor_i32_ret_av_av_no_agprs:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a6, v43 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a7, v44 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a8, v45 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a9, v46 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a10, v47 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a11, v56 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a12, v57 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a13, v58 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a14, v59 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a15, v60 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse
+; GFX950-NEXT: scratch_store_dword off, v40, s32 offset:72 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v41, s32 offset:68 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v42, s32 offset:64 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v43, s32 offset:60 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v44, s32 offset:56 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v45, s32 offset:52 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v46, s32 offset:48 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v47, s32 offset:44 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v56, s32 offset:40 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v57, s32 offset:36 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v58, s32 offset:32 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v59, s32 offset:28 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v60, s32 offset:24 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v61, s32 offset:20 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v62, s32 offset:16 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v63, s32 offset:12 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, a32, s32 offset:8 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, a33, s32 offset:4 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, a34, s32 ; 4-byte Folded Spill
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[0:31]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: scratch_store_dwordx4 off, v[0:3], s32 ; 16-byte Folded Spill
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: scratch_store_dwordx4 off, v[4:7], s32 offset:16 ; 16-byte Folded Spill
-; GFX950-NEXT: scratch_store_dwordx4 off, v[8:11], s32 offset:32 ; 16-byte Folded Spill
-; GFX950-NEXT: scratch_store_dwordx4 off, v[12:15], s32 offset:48 ; 16-byte Folded Spill
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def a2
+; GFX950-NEXT: ; def a34
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: v_accvgpr_write_b32 a19, v31 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_accvgpr_write_b32 a33, v31
+; GFX950-NEXT: v_accvgpr_write_b32 a32, v30
+; GFX950-NEXT: v_accvgpr_write_b32 a31, v29
+; GFX950-NEXT: v_accvgpr_write_b32 a30, v28
+; GFX950-NEXT: v_accvgpr_write_b32 a29, v27
+; GFX950-NEXT: v_accvgpr_write_b32 a28, v26
+; GFX950-NEXT: v_accvgpr_write_b32 a27, v25
+; GFX950-NEXT: v_accvgpr_write_b32 a26, v24
+; GFX950-NEXT: v_accvgpr_write_b32 a25, v23
+; GFX950-NEXT: v_accvgpr_write_b32 a24, v22
+; GFX950-NEXT: v_accvgpr_write_b32 a23, v21
+; GFX950-NEXT: v_accvgpr_write_b32 a22, v20
+; GFX950-NEXT: v_accvgpr_write_b32 a21, v19
+; GFX950-NEXT: v_accvgpr_write_b32 a20, v18
+; GFX950-NEXT: v_accvgpr_write_b32 a19, v17
+; GFX950-NEXT: v_accvgpr_write_b32 a18, v16
+; GFX950-NEXT: v_accvgpr_write_b32 a17, v15
+; GFX950-NEXT: v_accvgpr_write_b32 a16, v14
+; GFX950-NEXT: v_accvgpr_write_b32 a15, v13
+; GFX950-NEXT: v_accvgpr_write_b32 a14, v12
+; GFX950-NEXT: v_accvgpr_write_b32 a13, v11
+; GFX950-NEXT: v_accvgpr_write_b32 a12, v10
+; GFX950-NEXT: v_accvgpr_write_b32 a11, v9
+; GFX950-NEXT: v_accvgpr_write_b32 a10, v8
+; GFX950-NEXT: v_accvgpr_write_b32 a9, v7
+; GFX950-NEXT: v_accvgpr_write_b32 a8, v6
+; GFX950-NEXT: v_accvgpr_write_b32 a7, v5
+; GFX950-NEXT: v_accvgpr_write_b32 a6, v4
+; GFX950-NEXT: v_accvgpr_write_b32 a5, v3
+; GFX950-NEXT: v_accvgpr_write_b32 a4, v2
+; GFX950-NEXT: v_accvgpr_write_b32 a3, v1
+; GFX950-NEXT: v_accvgpr_write_b32 a2, v0
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
-; GFX950-NEXT: scratch_store_dwordx3 off, v[16:18], s32 offset:64 ; 12-byte Folded Spill
; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
-; GFX950-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX950-NEXT: v_accvgpr_read_b32 v2, a34
; GFX950-NEXT: buffer_wbl2 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: flat_atomic_xor v0, v[0:1], v2 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc1
-; GFX950-NEXT: v_accvgpr_write_b32 a31, v19 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a27, v23 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a28, v22 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a29, v21 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a30, v20 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a23, v27 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a24, v26 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a25, v25 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a26, v24 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: scratch_load_dwordx4 v[0:3], off, s32 ; 16-byte Folded Reload
-; GFX950-NEXT: scratch_load_dwordx4 v[4:7], off, s32 offset:16 ; 16-byte Folded Reload
-; GFX950-NEXT: scratch_load_dwordx4 v[8:11], off, s32 offset:32 ; 16-byte Folded Reload
-; GFX950-NEXT: scratch_load_dwordx4 v[12:15], off, s32 offset:48 ; 16-byte Folded Reload
-; GFX950-NEXT: v_accvgpr_read_b32 v19, a31 ; Reload Reuse
-; GFX950-NEXT: scratch_load_dwordx3 v[16:18], off, s32 offset:64 ; 12-byte Folded Reload
-; GFX950-NEXT: v_accvgpr_read_b32 v23, a27 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v22, a28 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v21, a29 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v20, a30 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v27, a23 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v26, a24 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v25, a25 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v24, a26 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v0, a2
+; GFX950-NEXT: v_accvgpr_read_b32 v1, a3
+; GFX950-NEXT: v_accvgpr_read_b32 v2, a4
+; GFX950-NEXT: v_accvgpr_read_b32 v3, a5
+; GFX950-NEXT: v_accvgpr_read_b32 v4, a6
+; GFX950-NEXT: v_accvgpr_read_b32 v5, a7
+; GFX950-NEXT: v_accvgpr_read_b32 v6, a8
+; GFX950-NEXT: v_accvgpr_read_b32 v7, a9
+; GFX950-NEXT: v_accvgpr_read_b32 v8, a10
+; GFX950-NEXT: v_accvgpr_read_b32 v9, a11
+; GFX950-NEXT: v_accvgpr_read_b32 v10, a12
+; GFX950-NEXT: v_accvgpr_read_b32 v11, a13
+; GFX950-NEXT: v_accvgpr_read_b32 v12, a14
+; GFX950-NEXT: v_accvgpr_read_b32 v13, a15
+; GFX950-NEXT: v_accvgpr_read_b32 v14, a16
+; GFX950-NEXT: v_accvgpr_read_b32 v15, a17
+; GFX950-NEXT: v_accvgpr_read_b32 v16, a18
+; GFX950-NEXT: v_accvgpr_read_b32 v17, a19
+; GFX950-NEXT: v_accvgpr_read_b32 v18, a20
+; GFX950-NEXT: v_accvgpr_read_b32 v19, a21
+; GFX950-NEXT: v_accvgpr_read_b32 v20, a22
+; GFX950-NEXT: v_accvgpr_read_b32 v21, a23
+; GFX950-NEXT: v_accvgpr_read_b32 v22, a24
+; GFX950-NEXT: v_accvgpr_read_b32 v23, a25
+; GFX950-NEXT: v_accvgpr_read_b32 v24, a26
+; GFX950-NEXT: v_accvgpr_read_b32 v25, a27
+; GFX950-NEXT: v_accvgpr_read_b32 v26, a28
+; GFX950-NEXT: v_accvgpr_read_b32 v27, a29
+; GFX950-NEXT: v_accvgpr_read_b32 v28, a30
+; GFX950-NEXT: v_accvgpr_read_b32 v29, a31
+; GFX950-NEXT: v_accvgpr_read_b32 v30, a32
+; GFX950-NEXT: v_accvgpr_read_b32 v31, a33
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use a0
+; GFX950-NEXT: ; use v[0:31]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_accvgpr_read_b32 v31, a19 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v30, a20 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v29, a21 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v28, a22 ; Reload Reuse
+; GFX950-NEXT: scratch_load_dword a34, off, s32 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword a33, off, s32 offset:4 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword a32, off, s32 offset:8 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v63, off, s32 offset:12 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v62, off, s32 offset:16 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v61, off, s32 offset:20 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v60, off, s32 offset:24 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v59, off, s32 offset:28 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v58, off, s32 offset:32 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v57, off, s32 offset:36 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v56, off, s32 offset:40 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v47, off, s32 offset:44 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v46, off, s32 offset:48 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v45, off, s32 offset:52 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v44, off, s32 offset:56 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v43, off, s32 offset:60 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v42, off, s32 offset:64 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v41, off, s32 offset:68 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v40, off, s32 offset:72 ; 4-byte Folded Reload
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v[0:31]
+; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: v_accvgpr_read_b32 v63, a18 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v62, a17 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v61, a16 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v60, a15 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v59, a14 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v58, a13 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v57, a12 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v56, a11 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v47, a10 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v46, a9 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v45, a8 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v44, a7 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v43, a6 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v42, a5 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v41, a4 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v40, a3 ; Reload Reuse
+; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
%data = call i32 asm "; def $0", "=^VA"()
@@ -4308,39 +4367,37 @@ define void @flat_atomic_xor_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1
-; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1
+; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
-; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB53_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX90A-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
-; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: .LBB53_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB53_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
-; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(1)
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX90A-NEXT: v_xor_b32_e32 v1, v3, v5
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v4
-; GFX90A-NEXT: v_xor_b32_e32 v3, v1, v3
-; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_xor_b32_e32 v4, v2, v4
+; GFX90A-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB53_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -4355,37 +4412,35 @@ define void @flat_atomic_xor_i64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
-; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
-; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
-; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX950-NEXT: v_accvgpr_read_b32 v5, a1
+; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
+; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB53_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: buffer_wbl2 sc1
-; GFX950-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] sc0
+; GFX950-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1], v[4:5] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc1
-; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: .LBB53_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB53_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
-; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
+; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
+; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v6, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_xor_b32_e32 v3, v1, v3
-; GFX950-NEXT: v_xor_b32_e32 v2, v0, v2
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
-; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
+; GFX950-NEXT: v_xor_b32_e32 v1, v3, v5
+; GFX950-NEXT: v_xor_b32_e32 v0, v2, v4
+; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off
; GFX950-NEXT: .LBB53_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -4497,38 +4552,36 @@ define void @flat_atomic_xor_i64_ret_v_a(ptr %ptr) #0 {
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB55_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX90A-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
-; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: .LBB55_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB55_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
-; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(1)
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX90A-NEXT: v_xor_b32_e32 v1, v3, v5
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v4
-; GFX90A-NEXT: v_xor_b32_e32 v3, v1, v3
-; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_xor_b32_e32 v4, v2, v4
+; GFX90A-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB55_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -4541,37 +4594,35 @@ define void @flat_atomic_xor_i64_ret_v_a(ptr %ptr) #0 {
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v[2:3]
+; GFX950-NEXT: ; def v[4:5]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB55_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: buffer_wbl2 sc1
-; GFX950-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] sc0
+; GFX950-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1], v[4:5] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc1
-; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: .LBB55_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB55_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
-; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
+; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
+; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v6, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_xor_b32_e32 v3, v1, v3
-; GFX950-NEXT: v_xor_b32_e32 v2, v0, v2
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
-; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
+; GFX950-NEXT: v_xor_b32_e32 v1, v3, v5
+; GFX950-NEXT: v_xor_b32_e32 v0, v2, v4
+; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off
; GFX950-NEXT: .LBB55_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -4766,38 +4817,36 @@ define void @flat_atomic_xor_i64_ret_av_a(ptr %ptr) #0 {
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB58_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX90A-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
-; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: .LBB58_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB58_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
-; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(1)
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX90A-NEXT: v_xor_b32_e32 v1, v3, v5
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v4
-; GFX90A-NEXT: v_xor_b32_e32 v3, v1, v3
-; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_xor_b32_e32 v4, v2, v4
+; GFX90A-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB58_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -4810,37 +4859,35 @@ define void @flat_atomic_xor_i64_ret_av_a(ptr %ptr) #0 {
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v[2:3]
+; GFX950-NEXT: ; def v[4:5]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB58_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: buffer_wbl2 sc1
-; GFX950-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] sc0
+; GFX950-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1], v[4:5] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc1
-; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: .LBB58_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB58_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
-; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
+; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
+; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v6, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_xor_b32_e32 v3, v1, v3
-; GFX950-NEXT: v_xor_b32_e32 v2, v0, v2
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
-; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
+; GFX950-NEXT: v_xor_b32_e32 v1, v3, v5
+; GFX950-NEXT: v_xor_b32_e32 v0, v2, v4
+; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off
; GFX950-NEXT: .LBB58_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -5440,13 +5487,13 @@ define void @flat_atomic_nand_i32_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB69_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -5469,12 +5516,12 @@ define void @flat_atomic_nand_i32_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB69_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -6062,13 +6109,13 @@ define void @flat_atomic_usub_cond_i32_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB85_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -6093,13 +6140,13 @@ define void @flat_atomic_usub_cond_i32_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB85_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -6193,12 +6240,12 @@ define void @flat_atomic_usub_sat_i32_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB87_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -6221,12 +6268,12 @@ define void @flat_atomic_usub_sat_i32_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB87_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -6305,48 +6352,45 @@ define void @flat_atomic_add_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_add_i64_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
-; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1
-; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
-; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
-; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1
+; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB89_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
-; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[4:5] glc
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
-; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: .LBB89_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB89_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
-; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: s_waitcnt vmcnt(1)
-; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v1, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
+; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v4
-; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v4, v3, vcc
-; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v0, v4
+; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v1, v5, vcc
+; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB89_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_add_i64_ret_a_a:
@@ -6354,43 +6398,41 @@ define void @flat_atomic_add_i64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
+; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
-; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
-; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
-; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
+; GFX950-NEXT: v_accvgpr_read_b32 v5, a1
+; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
+; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB89_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX950-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] sc0
-; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[4:5] sc0
+; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
-; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: .LBB89_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB89_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
-; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
+; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc
+; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, v[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
-; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
+; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, v[4:5]
+; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off
; GFX950-NEXT: .LBB89_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=a"()
@@ -6488,48 +6530,45 @@ define void @flat_atomic_sub_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_sub_i64_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
-; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1
-; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
-; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
-; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1
+; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB91_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
-; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[4:5] glc
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
-; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: .LBB91_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB91_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
-; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: s_waitcnt vmcnt(1)
-; GFX90A-NEXT: v_sub_co_u32_e32 v2, vcc, v1, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
+; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v4
-; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc
-; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_sub_co_u32_e32 v3, vcc, v0, v4
+; GFX90A-NEXT: v_subb_co_u32_e32 v4, vcc, v1, v5, vcc
+; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB91_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_sub_i64_ret_a_a:
@@ -6537,45 +6576,43 @@ define void @flat_atomic_sub_i64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
+; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
-; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
-; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
-; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
+; GFX950-NEXT: v_accvgpr_read_b32 v5, a1
+; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
+; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB91_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX950-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] sc0
-; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[4:5] sc0
+; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
-; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: .LBB91_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB91_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
-; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
+; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc
+; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
-; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
+; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v4
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v5, vcc
+; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off
; GFX950-NEXT: .LBB91_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=a"()
@@ -6675,48 +6712,45 @@ define void @flat_atomic_and_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_and_i64_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
-; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1
-; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
-; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
-; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1
+; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB93_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
-; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[4:5] glc
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
-; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: .LBB93_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB93_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
-; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
-; GFX90A-NEXT: s_waitcnt vmcnt(1)
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
+; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_and_b32_e32 v2, v4, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v4
-; GFX90A-NEXT: v_and_b32_e32 v3, v1, v3
-; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_and_b32_e32 v3, v1, v5
+; GFX90A-NEXT: v_and_b32_e32 v4, v0, v4
+; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB93_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_and_i64_ret_a_a:
@@ -6724,44 +6758,42 @@ define void @flat_atomic_and_i64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
+; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
-; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
-; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
-; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
+; GFX950-NEXT: v_accvgpr_read_b32 v5, a1
+; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
+; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB93_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX950-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] sc0
-; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[4:5] sc0
+; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
-; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: .LBB93_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB93_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
-; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
+; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc
+; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_and_b32_e32 v3, v1, v3
-; GFX950-NEXT: v_and_b32_e32 v2, v0, v2
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
-; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
+; GFX950-NEXT: v_and_b32_e32 v3, v1, v5
+; GFX950-NEXT: v_and_b32_e32 v2, v0, v4
+; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off
; GFX950-NEXT: .LBB93_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=a"()
@@ -6869,7 +6901,7 @@ define void @flat_atomic_nand_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: v_accvgpr_read_b32 v7, a1
; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
-; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB95_4
@@ -6886,8 +6918,6 @@ define void @flat_atomic_nand_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
@@ -6901,21 +6931,21 @@ define void @flat_atomic_nand_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: s_cbranch_execz .LBB95_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v4, vcc
-; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v4, vcc
+; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_and_b32_e32 v3, v1, v7
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_and_b32_e32 v4, v2, v6
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT: v_not_b32_e32 v2, v3
-; GFX90A-NEXT: v_not_b32_e32 v3, v4
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
-; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_and_b32_e32 v4, v0, v6
+; GFX90A-NEXT: v_not_b32_e32 v4, v4
+; GFX90A-NEXT: v_not_b32_e32 v3, v3
+; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB95_6: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -6934,7 +6964,7 @@ define void @flat_atomic_nand_i64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
; GFX950-NEXT: v_accvgpr_read_b32 v7, a1
; GFX950-NEXT: v_accvgpr_read_b32 v6, a0
-; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB95_4
@@ -6951,8 +6981,6 @@ define void @flat_atomic_nand_i64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
@@ -6972,13 +7000,13 @@ define void @flat_atomic_nand_i64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_and_b32_e32 v2, v1, v7
; GFX950-NEXT: v_and_b32_e32 v5, v0, v6
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_not_b32_e32 v3, v2
; GFX950-NEXT: v_not_b32_e32 v2, v5
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
; GFX950-NEXT: .LBB95_6: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -7118,48 +7146,45 @@ define void @flat_atomic_or_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_or_i64_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
-; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1
-; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
-; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
-; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1
+; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB97_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
-; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[4:5] glc
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
-; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: .LBB97_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB97_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
-; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
-; GFX90A-NEXT: s_waitcnt vmcnt(1)
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
+; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_or_b32_e32 v2, v4, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v4
-; GFX90A-NEXT: v_or_b32_e32 v3, v1, v3
-; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_or_b32_e32 v3, v1, v5
+; GFX90A-NEXT: v_or_b32_e32 v4, v0, v4
+; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB97_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_or_i64_ret_a_a:
@@ -7167,44 +7192,42 @@ define void @flat_atomic_or_i64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
+; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
-; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
-; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
-; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
+; GFX950-NEXT: v_accvgpr_read_b32 v5, a1
+; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
+; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB97_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX950-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] sc0
-; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[4:5] sc0
+; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
-; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: .LBB97_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB97_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
-; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
+; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc
+; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_or_b32_e32 v3, v1, v3
-; GFX950-NEXT: v_or_b32_e32 v2, v0, v2
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
-; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
+; GFX950-NEXT: v_or_b32_e32 v3, v1, v5
+; GFX950-NEXT: v_or_b32_e32 v2, v0, v4
+; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off
; GFX950-NEXT: .LBB97_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=a"()
@@ -7309,43 +7332,40 @@ define void @flat_atomic_max_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
-; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1
+; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
-; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB99_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc
-; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[4:5] glc
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
-; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: .LBB99_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB99_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc
-; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: s_waitcnt vmcnt(1)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
+; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[0:1]
-; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
-; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
+; GFX90A-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc
+; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v1, vcc
+; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB99_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_max_i64_ret_a_a:
@@ -7353,46 +7373,44 @@ define void @flat_atomic_max_i64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3]
+; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
-; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
-; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
-; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
+; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
+; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB99_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX950-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] sc0
-; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-NEXT: flat_atomic_smax_x2 v[0:1], v[4:5], v[2:3] sc0
+; GFX950-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
-; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB99_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB99_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc
-; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off
+; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
-; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off
+; GFX950-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3]
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
; GFX950-NEXT: .LBB99_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=a"()
@@ -7500,43 +7518,40 @@ define void @flat_atomic_min_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
-; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1
+; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
-; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB101_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc
-; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[4:5] glc
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
-; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: .LBB101_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB101_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc
-; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: s_waitcnt vmcnt(1)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
+; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_cmp_le_i64_e32 vcc, v[2:3], v[0:1]
-; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
-; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
+; GFX90A-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc
+; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v1, vcc
+; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB101_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_min_i64_ret_a_a:
@@ -7544,46 +7559,44 @@ define void @flat_atomic_min_i64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3]
+; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
-; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
-; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
-; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
+; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
+; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB101_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX950-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] sc0
-; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-NEXT: flat_atomic_smin_x2 v[0:1], v[4:5], v[2:3] sc0
+; GFX950-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
-; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB101_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB101_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc
-; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off
+; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_cmp_le_i64_e32 vcc, v[2:3], v[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
-; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off
+; GFX950-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[2:3]
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
; GFX950-NEXT: .LBB101_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=a"()
@@ -7691,43 +7704,40 @@ define void @flat_atomic_umax_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
-; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1
+; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
-; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB103_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc
-; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[4:5] glc
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
-; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: .LBB103_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB103_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc
-; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: s_waitcnt vmcnt(1)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
+; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
-; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
-; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
+; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc
+; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v1, vcc
+; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB103_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_umax_i64_ret_a_a:
@@ -7735,46 +7745,44 @@ define void @flat_atomic_umax_i64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3]
+; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
-; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
-; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
-; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
+; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
+; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB103_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX950-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] sc0
-; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-NEXT: flat_atomic_umax_x2 v[0:1], v[4:5], v[2:3] sc0
+; GFX950-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
-; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB103_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB103_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc
-; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off
+; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
-; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off
+; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
; GFX950-NEXT: .LBB103_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=a"()
@@ -7882,43 +7890,40 @@ define void @flat_atomic_umin_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
-; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1
+; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
-; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB105_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc
-; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[4:5] glc
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
-; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: .LBB105_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB105_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc
-; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: s_waitcnt vmcnt(1)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
+; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_cmp_le_u64_e32 vcc, v[2:3], v[0:1]
-; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
-; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
+; GFX90A-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc
+; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v1, vcc
+; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB105_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_umin_i64_ret_a_a:
@@ -7926,46 +7931,44 @@ define void @flat_atomic_umin_i64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3]
+; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
-; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
-; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
-; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
+; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
+; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB105_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX950-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] sc0
-; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-NEXT: flat_atomic_umin_x2 v[0:1], v[4:5], v[2:3] sc0
+; GFX950-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
-; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB105_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB105_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc
-; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off
+; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_cmp_le_u64_e32 vcc, v[2:3], v[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
-; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off
+; GFX950-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[2:3]
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
; GFX950-NEXT: .LBB105_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=a"()
@@ -8073,45 +8076,42 @@ define void @flat_atomic_uinc_wrap_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
-; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1
+; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
-; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB107_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
-; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[4:5] glc
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
-; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: .LBB107_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB107_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc
-; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: s_waitcnt vmcnt(1)
-; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, 1, v2
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
+; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v3, vcc
-; GFX90A-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
-; GFX90A-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc
-; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 1, v0
+; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc
+; GFX90A-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; GFX90A-NEXT: v_cndmask_b32_e32 v4, 0, v6, vcc
+; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB107_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_uinc_wrap_i64_ret_a_a:
@@ -8119,46 +8119,45 @@ define void @flat_atomic_uinc_wrap_i64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
+; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
-; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
-; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
-; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
+; GFX950-NEXT: v_accvgpr_read_b32 v5, a1
+; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
+; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB107_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX950-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] sc0
-; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[4:5] sc0
+; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
-; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: .LBB107_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB107_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
+; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, 1
-; GFX950-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
-; GFX950-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc
-; GFX950-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
+; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, 1
+; GFX950-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[4:5]
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; GFX950-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off
; GFX950-NEXT: .LBB107_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=a"()
@@ -8263,53 +8262,50 @@ define void @flat_atomic_udec_wrap_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_udec_wrap_i64_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
-; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
+; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
-; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
-; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
-; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1
+; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB109_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
-; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] glc
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
-; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: .LBB109_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB109_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc
-; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: s_waitcnt vmcnt(1)
-; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, -1, v2
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, -1, v3, vcc
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
-; GFX90A-NEXT: v_cmp_gt_u64_e64 s[4:5], v[2:3], v[0:1]
+; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, -1, v0
+; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, -1, v1, vcc
+; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GFX90A-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3]
; GFX90A-NEXT: s_or_b64 vcc, vcc, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
-; GFX90A-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
-; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
+; GFX90A-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc
+; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB109_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_udec_wrap_i64_ret_a_a:
@@ -8317,48 +8313,46 @@ define void @flat_atomic_udec_wrap_i64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3]
+; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
-; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
-; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
-; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
+; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
+; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB109_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX950-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] sc0
-; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] sc0
+; GFX950-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
-; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB109_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[2:3], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB109_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc
-; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v6, off
+; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc
+; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
-; GFX950-NEXT: v_cmp_gt_u64_e64 s[0:1], v[2:3], v[0:1]
-; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[2:3], 0, -1
+; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GFX950-NEXT: v_cmp_gt_u64_e64 s[0:1], v[0:1], v[2:3]
+; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, -1
; GFX950-NEXT: s_or_b64 vcc, vcc, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX950-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
-; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off
; GFX950-NEXT: .LBB109_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=a"()
@@ -8466,64 +8460,62 @@ define void @flat_atomic_usub_cond_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_usub_cond_i64_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
-; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
+; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1
-; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
-; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7
-; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX90A-NEXT: v_accvgpr_read_b32 v7, a1
+; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB111_4
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
+; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: .LBB111_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4
-; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc
-; GFX90A-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[4:5]
+; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v6
+; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v7, vcc
+; GFX90A-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[6:7]
; GFX90A-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
+; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB111_2
; GFX90A-NEXT: ; %bb.3: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX90A-NEXT: .LBB111_4: ; %Flow3
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB111_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
-; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
-; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v4, vcc
; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(1)
-; GFX90A-NEXT: v_sub_co_u32_e32 v3, vcc, v0, v4
+; GFX90A-NEXT: v_sub_co_u32_e32 v3, vcc, v0, v6
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_subb_co_u32_e32 v6, vcc, v1, v5, vcc
-; GFX90A-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v1, v6, vcc
-; GFX90A-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
+; GFX90A-NEXT: v_subb_co_u32_e32 v4, vcc, v1, v7, vcc
+; GFX90A-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[6:7]
+; GFX90A-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc
+; GFX90A-NEXT: v_cndmask_b32_e32 v4, v1, v4, vcc
+; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB111_6: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -8542,7 +8534,7 @@ define void @flat_atomic_usub_cond_i64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v7
; GFX950-NEXT: v_accvgpr_read_b32 v5, a1
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
-; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB111_4
@@ -8562,8 +8554,6 @@ define void @flat_atomic_usub_cond_i64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
@@ -8585,13 +8575,14 @@ define void @flat_atomic_usub_cond_i64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v5, vcc
; GFX950-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[4:5]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc
; GFX950-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc
; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off
; GFX950-NEXT: .LBB111_6: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -8748,7 +8739,7 @@ define void @flat_atomic_usub_sat_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: v_accvgpr_read_b32 v7, a1
; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
-; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB113_4
@@ -8766,8 +8757,6 @@ define void @flat_atomic_usub_sat_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
@@ -8789,14 +8778,14 @@ define void @flat_atomic_usub_sat_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v7, vcc
; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_cndmask_b32_e64 v0, v3, 0, vcc
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB113_6: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -8815,7 +8804,7 @@ define void @flat_atomic_usub_sat_i64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
; GFX950-NEXT: v_accvgpr_read_b32 v7, a1
; GFX950-NEXT: v_accvgpr_read_b32 v6, a0
-; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB113_4
@@ -8835,8 +8824,6 @@ define void @flat_atomic_usub_sat_i64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
@@ -8858,13 +8845,14 @@ define void @flat_atomic_usub_sat_i64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v7, vcc
; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
; GFX950-NEXT: .LBB113_6: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -9022,55 +9010,53 @@ define void @flat_atomic_fadd_f32_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v3, a0
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
-; GFX90A-NEXT: ; implicit-def: $agpr0
+; GFX90A-NEXT: ; implicit-def: $vgpr2
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB115_6
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1
-; GFX90A-NEXT: ; implicit-def: $agpr0
+; GFX90A-NEXT: ; implicit-def: $vgpr2
; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc
; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execz .LBB115_3
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global
-; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off glc
-; GFX90A-NEXT: ; implicit-def: $vgpr2
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: global_atomic_add_f32 v2, v[0:1], v3, off glc
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr3
; GFX90A-NEXT: .LBB115_3: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7]
; GFX90A-NEXT: s_cbranch_execz .LBB115_5
; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
-; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_add_f32_e32 v2, v1, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1
-; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GFX90A-NEXT: v_add_f32_e32 v1, v2, v3
+; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; GFX90A-NEXT: .LBB115_5: ; %Flow1
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX90A-NEXT: ; implicit-def: $vgpr2
+; GFX90A-NEXT: ; implicit-def: $vgpr3
; GFX90A-NEXT: .LBB115_6: ; %Flow2
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB115_8
; GFX90A-NEXT: ; %bb.7: ; %atomicrmw.shared
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
-; GFX90A-NEXT: ds_add_rtn_f32 v0, v0, v2
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: ds_add_rtn_f32 v2, v0, v3
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: .LBB115_8: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fadd_f32_ret_a_a:
@@ -9189,12 +9175,12 @@ define void @flat_atomic_fsub_f32_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB117_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -9217,12 +9203,12 @@ define void @flat_atomic_fsub_f32_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB117_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -9312,13 +9298,13 @@ define void @flat_atomic_fmax_f32_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB119_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -9342,13 +9328,13 @@ define void @flat_atomic_fmax_f32_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB119_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -9442,13 +9428,13 @@ define void @flat_atomic_fmin_f32_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB121_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -9472,13 +9458,13 @@ define void @flat_atomic_fmin_f32_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB121_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -9573,13 +9559,13 @@ define void @flat_atomic_fmaximum_f32_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB123_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -9602,12 +9588,12 @@ define void @flat_atomic_fmaximum_f32_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB123_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -9701,13 +9687,13 @@ define void @flat_atomic_fminimum_f32_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB125_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -9730,12 +9716,12 @@ define void @flat_atomic_fminimum_f32_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB125_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -9817,68 +9803,63 @@ define void @flat_atomic_fadd_f64_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fadd_f64_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base
-; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1
-; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
-; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
-; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1
+; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB127_6
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
-; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1
-; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v3
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc
; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execz .LBB127_3
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global
-; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off glc
+; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[2:3], v[4:5], off glc
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
-; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: .LBB127_3: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7]
; GFX90A-NEXT: s_cbranch_execz .LBB127_5
; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.private
-; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
-; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: s_waitcnt vmcnt(1)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc
+; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3]
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
-; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], v[4:5]
+; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB127_5: ; %Flow1
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: .LBB127_6: ; %Flow2
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB127_8
; GFX90A-NEXT: ; %bb.7: ; %atomicrmw.shared
-; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
-; GFX90A-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3]
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v2, vcc
+; GFX90A-NEXT: ds_add_rtn_f64 v[0:1], v0, v[4:5]
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: .LBB127_8: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fadd_f64_ret_a_a:
@@ -9886,65 +9867,61 @@ define void @flat_atomic_fadd_f64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_shared_base
-; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
+; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
-; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
-; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
-; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
+; GFX950-NEXT: v_accvgpr_read_b32 v5, a1
+; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
+; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB127_6
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.check.private
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
-; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s3, v1
-; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s3, v3
+; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX950-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
; GFX950-NEXT: s_cbranch_execz .LBB127_3
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.global
-; GFX950-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off sc0
+; GFX950-NEXT: global_atomic_add_f64 v[0:1], v[2:3], v[4:5], off sc0
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
-; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: .LBB127_3: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3]
; GFX950-NEXT: s_cbranch_execz .LBB127_5
; GFX950-NEXT: ; %bb.4: ; %atomicrmw.private
-; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
-; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
+; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc
+; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
-; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
+; GFX950-NEXT: v_add_f64 v[2:3], v[0:1], v[4:5]
+; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off
; GFX950-NEXT: .LBB127_5: ; %Flow1
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: .LBB127_6: ; %Flow2
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB127_8
; GFX950-NEXT: ; %bb.7: ; %atomicrmw.shared
-; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
-; GFX950-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3]
+; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v0, -1, v2, vcc
+; GFX950-NEXT: ds_add_rtn_f64 v[0:1], v0, v[4:5]
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: .LBB127_8: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10
%data = call double asm "; def $0", "=a"()
@@ -10089,7 +10066,7 @@ define void @flat_atomic_fsub_f64_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: v_accvgpr_read_b32 v7, a1
; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
-; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB129_4
@@ -10103,9 +10080,7 @@ define void @flat_atomic_fsub_f64_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB129_2
@@ -10121,15 +10096,14 @@ define void @flat_atomic_fsub_f64_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: s_waitcnt vmcnt(1)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], -v[6:7]
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB129_6: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -10148,7 +10122,7 @@ define void @flat_atomic_fsub_f64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
; GFX950-NEXT: v_accvgpr_read_b32 v7, a1
; GFX950-NEXT: v_accvgpr_read_b32 v6, a0
-; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB129_4
@@ -10162,9 +10136,7 @@ define void @flat_atomic_fsub_f64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB129_2
@@ -10181,12 +10153,12 @@ define void @flat_atomic_fsub_f64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_add_f64 v[2:3], v[0:1], -v[6:7]
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
; GFX950-NEXT: .LBB129_6: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -10313,49 +10285,46 @@ define void @flat_atomic_fmax_f64_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmax_f64_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
-; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1
-; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
-; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
-; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1
+; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB131_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] glc
-; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[2:3], v[4:5] glc
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
-; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: .LBB131_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB131_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc
; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX90A-NEXT: s_waitcnt vmcnt(1)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3]
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB131_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmax_f64_ret_a_a:
@@ -10363,45 +10332,43 @@ define void @flat_atomic_fmax_f64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
+; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
-; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
-; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
-; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
+; GFX950-NEXT: v_accvgpr_read_b32 v5, a1
+; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
+; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB131_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX950-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] sc0
-; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-NEXT: flat_atomic_max_f64 v[0:1], v[2:3], v[4:5] sc0
+; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
-; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: .LBB131_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB131_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; GFX950-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX950-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
+; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX950-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1]
+; GFX950-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off
; GFX950-NEXT: .LBB131_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10
%data = call double asm "; def $0", "=a"()
@@ -10502,49 +10469,46 @@ define void @flat_atomic_fmin_f64_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmin_f64_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
-; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1
-; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
-; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
-; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1
+; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB133_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] glc
-; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[2:3], v[4:5] glc
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
-; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: .LBB133_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB133_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc
; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX90A-NEXT: s_waitcnt vmcnt(1)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX90A-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3]
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB133_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmin_f64_ret_a_a:
@@ -10552,45 +10516,43 @@ define void @flat_atomic_fmin_f64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
+; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
-; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
-; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
-; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
+; GFX950-NEXT: v_accvgpr_read_b32 v5, a1
+; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
+; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB133_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX950-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] sc0
-; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-NEXT: flat_atomic_min_f64 v[0:1], v[2:3], v[4:5] sc0
+; GFX950-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
-; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: .LBB133_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB133_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; GFX950-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX950-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
+; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX950-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1]
+; GFX950-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off
; GFX950-NEXT: .LBB133_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10
%data = call double asm "; def $0", "=a"()
@@ -10700,7 +10662,7 @@ define void @flat_atomic_fmaximum_f64_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7
-; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB135_4
@@ -10718,8 +10680,6 @@ define void @flat_atomic_fmaximum_f64_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
@@ -10737,18 +10697,17 @@ define void @flat_atomic_fmaximum_f64_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000
-; GFX90A-NEXT: s_waitcnt vmcnt(1)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5]
; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
-; GFX90A-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc
+; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB135_6: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -10767,7 +10726,7 @@ define void @flat_atomic_fmaximum_f64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v7
; GFX950-NEXT: v_accvgpr_read_b32 v5, a1
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
-; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB135_4
@@ -10786,8 +10745,6 @@ define void @flat_atomic_fmaximum_f64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
@@ -10808,13 +10765,14 @@ define void @flat_atomic_fmaximum_f64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5]
; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off
; GFX950-NEXT: .LBB135_6: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -10968,7 +10926,7 @@ define void @flat_atomic_fminimum_f64_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7
-; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB137_4
@@ -10986,8 +10944,6 @@ define void @flat_atomic_fminimum_f64_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
@@ -11005,18 +10961,17 @@ define void @flat_atomic_fminimum_f64_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000
-; GFX90A-NEXT: s_waitcnt vmcnt(1)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5]
; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
-; GFX90A-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc
+; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB137_6: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -11035,7 +10990,7 @@ define void @flat_atomic_fminimum_f64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v7
; GFX950-NEXT: v_accvgpr_read_b32 v5, a1
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
-; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB137_4
@@ -11054,8 +11009,6 @@ define void @flat_atomic_fminimum_f64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
@@ -11076,13 +11029,14 @@ define void @flat_atomic_fminimum_f64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5]
; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off
; GFX950-NEXT: .LBB137_6: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -11245,12 +11199,12 @@ define void @flat_atomic_fadd_v2f16_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB139_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -11342,12 +11296,12 @@ define void @flat_atomic_fsub_v2f16_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB141_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -11370,12 +11324,12 @@ define void @flat_atomic_fsub_v2f16_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB141_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -11465,13 +11419,13 @@ define void @flat_atomic_fmax_v2f16_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB143_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -11496,13 +11450,13 @@ define void @flat_atomic_fmax_v2f16_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB143_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -11597,13 +11551,13 @@ define void @flat_atomic_fmin_v2f16_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB145_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -11628,13 +11582,13 @@ define void @flat_atomic_fmin_v2f16_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB145_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -11734,13 +11688,13 @@ define void @flat_atomic_fmaximum_v2f16_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB147_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -11763,12 +11717,12 @@ define void @flat_atomic_fmaximum_v2f16_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB147_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -11870,13 +11824,13 @@ define void @flat_atomic_fminimum_v2f16_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB149_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -11899,12 +11853,12 @@ define void @flat_atomic_fminimum_v2f16_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB149_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -12021,13 +11975,13 @@ define void @flat_atomic_fadd_v2bf16_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB151_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -12154,13 +12108,13 @@ define void @flat_atomic_fsub_v2bf16_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB153_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -12188,13 +12142,13 @@ define void @flat_atomic_fsub_v2bf16_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB153_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -12324,13 +12278,13 @@ define void @flat_atomic_fmax_v2bf16_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB155_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -12358,13 +12312,13 @@ define void @flat_atomic_fmax_v2bf16_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB155_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -12494,13 +12448,13 @@ define void @flat_atomic_fmin_v2bf16_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB157_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -12528,13 +12482,13 @@ define void @flat_atomic_fmin_v2bf16_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB157_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -12669,13 +12623,13 @@ define void @flat_atomic_fmaximum_v2bf16_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB159_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -12703,13 +12657,13 @@ define void @flat_atomic_fmaximum_v2bf16_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB159_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -12849,13 +12803,13 @@ define void @flat_atomic_fminimum_v2bf16_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB161_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -12883,13 +12837,13 @@ define void @flat_atomic_fminimum_v2bf16_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB161_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -13328,13 +13282,13 @@ define void @flat_atomic_nand_i32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB171_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -13358,12 +13312,12 @@ define void @flat_atomic_nand_i32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB171_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -14089,13 +14043,13 @@ define void @flat_atomic_usub_cond_i32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB189_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -14121,13 +14075,13 @@ define void @flat_atomic_usub_cond_i32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB189_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -14224,12 +14178,12 @@ define void @flat_atomic_usub_sat_i32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB191_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -14253,12 +14207,12 @@ define void @flat_atomic_usub_sat_i32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB191_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -14355,28 +14309,26 @@ define void @flat_atomic_xchg_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_swap_x2 v[2:3], v[2:3], v[0:1] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: s_cbranch_execz .LBB193_3
; GFX90A-NEXT: s_branch .LBB193_4
; GFX90A-NEXT: .LBB193_2:
-; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: .LBB193_3: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
-; GFX90A-NEXT: v_mov_b32_e32 v2, s4
-; GFX90A-NEXT: buffer_load_dword a0, v2, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_load_dword a1, v2, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_mov_b32_e32 v4, s4
+; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB193_4: ; %atomicrmw.end
-; GFX90A-NEXT: s_waitcnt vmcnt(2)
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xchg_i64_saddr_ret_a_a:
@@ -14398,25 +14350,23 @@ define void @flat_atomic_xchg_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_accvgpr_write_b32 a3, v1
-; GFX950-NEXT: v_accvgpr_write_b32 a2, v0
; GFX950-NEXT: s_cbranch_execz .LBB193_3
; GFX950-NEXT: s_branch .LBB193_4
; GFX950-NEXT: .LBB193_2:
-; GFX950-NEXT: ; implicit-def: $agpr2_agpr3
+; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB193_3: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
-; GFX950-NEXT: scratch_load_dwordx2 a[2:3], off, s0
+; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: scratch_store_dwordx2 off, a[0:1], s0
; GFX950-NEXT: .LBB193_4: ; %atomicrmw.end
-; GFX950-NEXT: s_waitcnt vmcnt(1)
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use a[2:3]
+; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=a"()
@@ -14523,32 +14473,28 @@ define void @flat_atomic_add_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_add_x2 v[2:3], v[2:3], v[0:1] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: s_cbranch_execz .LBB195_3
; GFX90A-NEXT: s_branch .LBB195_4
; GFX90A-NEXT: .LBB195_2:
-; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: .LBB195_3: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
-; GFX90A-NEXT: v_mov_b32_e32 v2, s4
-; GFX90A-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: s_waitcnt vmcnt(1)
-; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v4, s4
+; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v4
-; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
-; GFX90A-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
+; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB195_4: ; %atomicrmw.end
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_add_i64_saddr_ret_a_a:
@@ -14570,27 +14516,24 @@ define void @flat_atomic_add_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-NEXT: flat_atomic_add_x2 v[2:3], v[2:3], v[0:1] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: s_cbranch_execz .LBB195_3
; GFX950-NEXT: s_branch .LBB195_4
; GFX950-NEXT: .LBB195_2:
-; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: .LBB195_3: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 0, v[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0
; GFX950-NEXT: .LBB195_4: ; %atomicrmw.end
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=a"()
@@ -14700,32 +14643,28 @@ define void @flat_atomic_sub_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_sub_x2 v[2:3], v[2:3], v[0:1] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: s_cbranch_execz .LBB197_3
; GFX90A-NEXT: s_branch .LBB197_4
; GFX90A-NEXT: .LBB197_2:
-; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: .LBB197_3: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
-; GFX90A-NEXT: v_mov_b32_e32 v2, s4
-; GFX90A-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: s_waitcnt vmcnt(1)
-; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v3, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v4, s4
+; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v4
-; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v4, v1, vcc
-; GFX90A-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v0
+; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc
+; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB197_4: ; %atomicrmw.end
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_sub_i64_saddr_ret_a_a:
@@ -14739,37 +14678,34 @@ define void @flat_atomic_sub_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
-; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
-; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: s_cbranch_vccz .LBB197_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
-; GFX950-NEXT: flat_atomic_sub_x2 v[2:3], v[2:3], v[0:1] sc0
+; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: s_cbranch_execz .LBB197_3
; GFX950-NEXT: s_branch .LBB197_4
; GFX950-NEXT: .LBB197_2:
-; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB197_3: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
-; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0
+; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
-; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0
+; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
+; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
; GFX950-NEXT: .LBB197_4: ; %atomicrmw.end
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=a"()
@@ -14881,32 +14817,28 @@ define void @flat_atomic_and_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_and_x2 v[2:3], v[2:3], v[0:1] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: s_cbranch_execz .LBB199_3
; GFX90A-NEXT: s_branch .LBB199_4
; GFX90A-NEXT: .LBB199_2:
-; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: .LBB199_3: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
-; GFX90A-NEXT: v_mov_b32_e32 v2, s4
-; GFX90A-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen
-; GFX90A-NEXT: s_waitcnt vmcnt(1)
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
+; GFX90A-NEXT: v_mov_b32_e32 v4, s4
+; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_and_b32_e32 v0, v4, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v4
; GFX90A-NEXT: v_and_b32_e32 v1, v3, v1
-; GFX90A-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_and_b32_e32 v0, v2, v0
+; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB199_4: ; %atomicrmw.end
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_and_i64_saddr_ret_a_a:
@@ -14920,36 +14852,33 @@ define void @flat_atomic_and_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
-; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
-; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: s_cbranch_vccz .LBB199_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
-; GFX950-NEXT: flat_atomic_and_x2 v[2:3], v[2:3], v[0:1] sc0
+; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: s_cbranch_execz .LBB199_3
; GFX950-NEXT: s_branch .LBB199_4
; GFX950-NEXT: .LBB199_2:
-; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB199_3: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
-; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0
+; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX950-NEXT: v_and_b32_e32 v1, v3, v1
-; GFX950-NEXT: v_and_b32_e32 v0, v2, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
-; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0
+; GFX950-NEXT: v_and_b32_e32 v3, v1, v3
+; GFX950-NEXT: v_and_b32_e32 v2, v0, v2
+; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
; GFX950-NEXT: .LBB199_4: ; %atomicrmw.end
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=a"()
@@ -15070,8 +14999,6 @@ define void @flat_atomic_nand_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
@@ -15080,25 +15007,25 @@ define void @flat_atomic_nand_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_branch .LBB201_6
; GFX90A-NEXT: .LBB201_4:
-; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_cbranch_execz .LBB201_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
-; GFX90A-NEXT: v_mov_b32_e32 v0, s4
-; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
+; GFX90A-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_and_b32_e32 v3, v1, v5
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_and_b32_e32 v4, v2, v4
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT: v_not_b32_e32 v2, v3
-; GFX90A-NEXT: v_not_b32_e32 v3, v4
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
-; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_and_b32_e32 v4, v0, v4
+; GFX90A-NEXT: v_not_b32_e32 v4, v4
+; GFX90A-NEXT: v_not_b32_e32 v3, v3
+; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB201_6: ; %atomicrmw.phi
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -15134,8 +15061,6 @@ define void @flat_atomic_nand_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
@@ -15144,7 +15069,7 @@ define void @flat_atomic_nand_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_branch .LBB201_6
; GFX950-NEXT: .LBB201_4:
-; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_cbranch_execz .LBB201_6
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
@@ -15153,12 +15078,12 @@ define void @flat_atomic_nand_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_and_b32_e32 v2, v1, v5
; GFX950-NEXT: v_and_b32_e32 v4, v0, v4
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_not_b32_e32 v3, v2
; GFX950-NEXT: v_not_b32_e32 v2, v4
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
; GFX950-NEXT: .LBB201_6: ; %atomicrmw.phi
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -15310,32 +15235,28 @@ define void @flat_atomic_or_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_or_x2 v[2:3], v[2:3], v[0:1] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: s_cbranch_execz .LBB203_3
; GFX90A-NEXT: s_branch .LBB203_4
; GFX90A-NEXT: .LBB203_2:
-; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: .LBB203_3: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
-; GFX90A-NEXT: v_mov_b32_e32 v2, s4
-; GFX90A-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen
-; GFX90A-NEXT: s_waitcnt vmcnt(1)
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
+; GFX90A-NEXT: v_mov_b32_e32 v4, s4
+; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_or_b32_e32 v0, v4, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v4
; GFX90A-NEXT: v_or_b32_e32 v1, v3, v1
-; GFX90A-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB203_4: ; %atomicrmw.end
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_or_i64_saddr_ret_a_a:
@@ -15349,36 +15270,33 @@ define void @flat_atomic_or_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
-; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
-; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: s_cbranch_vccz .LBB203_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
-; GFX950-NEXT: flat_atomic_or_x2 v[2:3], v[2:3], v[0:1] sc0
+; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: s_cbranch_execz .LBB203_3
; GFX950-NEXT: s_branch .LBB203_4
; GFX950-NEXT: .LBB203_2:
-; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB203_3: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
-; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0
+; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX950-NEXT: v_or_b32_e32 v1, v3, v1
-; GFX950-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
-; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0
+; GFX950-NEXT: v_or_b32_e32 v3, v1, v3
+; GFX950-NEXT: v_or_b32_e32 v2, v0, v2
+; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
; GFX950-NEXT: .LBB203_4: ; %atomicrmw.end
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=a"()
@@ -15489,32 +15407,28 @@ define void @flat_atomic_xor_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_xor_x2 v[2:3], v[2:3], v[0:1] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: s_cbranch_execz .LBB205_3
; GFX90A-NEXT: s_branch .LBB205_4
; GFX90A-NEXT: .LBB205_2:
-; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: .LBB205_3: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
-; GFX90A-NEXT: v_mov_b32_e32 v2, s4
-; GFX90A-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen
-; GFX90A-NEXT: s_waitcnt vmcnt(1)
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
+; GFX90A-NEXT: v_mov_b32_e32 v4, s4
+; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_xor_b32_e32 v0, v4, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v4
; GFX90A-NEXT: v_xor_b32_e32 v1, v3, v1
-; GFX90A-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_xor_b32_e32 v0, v2, v0
+; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB205_4: ; %atomicrmw.end
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xor_i64_saddr_ret_a_a:
@@ -15528,36 +15442,33 @@ define void @flat_atomic_xor_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
-; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
-; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: s_cbranch_vccz .LBB205_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
-; GFX950-NEXT: flat_atomic_xor_x2 v[2:3], v[2:3], v[0:1] sc0
+; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: s_cbranch_execz .LBB205_3
; GFX950-NEXT: s_branch .LBB205_4
; GFX950-NEXT: .LBB205_2:
-; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB205_3: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
-; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0
+; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX950-NEXT: v_xor_b32_e32 v1, v3, v1
-; GFX950-NEXT: v_xor_b32_e32 v0, v2, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
-; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0
+; GFX950-NEXT: v_xor_b32_e32 v3, v1, v3
+; GFX950-NEXT: v_xor_b32_e32 v2, v0, v2
+; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
; GFX950-NEXT: .LBB205_4: ; %atomicrmw.end
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=a"()
@@ -15668,33 +15579,29 @@ define void @flat_atomic_max_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_smax_x2 v[2:3], v[2:3], v[0:1] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: s_cbranch_execz .LBB207_3
; GFX90A-NEXT: s_branch .LBB207_4
; GFX90A-NEXT: .LBB207_2:
-; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: .LBB207_3: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: s_waitcnt vmcnt(1)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[0:1]
-; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB207_4: ; %atomicrmw.end
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_max_i64_saddr_ret_a_a:
@@ -15708,38 +15615,35 @@ define void @flat_atomic_max_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
-; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
-; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: s_cbranch_vccz .LBB207_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
-; GFX950-NEXT: flat_atomic_smax_x2 v[2:3], v[2:3], v[0:1] sc0
+; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: s_cbranch_execz .LBB207_3
; GFX950-NEXT: s_branch .LBB207_4
; GFX950-NEXT: .LBB207_2:
-; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB207_3: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
-; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0
+; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
-; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0
+; GFX950-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3]
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
; GFX950-NEXT: .LBB207_4: ; %atomicrmw.end
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=a"()
@@ -15853,33 +15757,29 @@ define void @flat_atomic_min_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_smin_x2 v[2:3], v[2:3], v[0:1] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: s_cbranch_execz .LBB209_3
; GFX90A-NEXT: s_branch .LBB209_4
; GFX90A-NEXT: .LBB209_2:
-; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: .LBB209_3: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: s_waitcnt vmcnt(1)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_le_i64_e32 vcc, v[2:3], v[0:1]
-; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB209_4: ; %atomicrmw.end
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_min_i64_saddr_ret_a_a:
@@ -15893,38 +15793,35 @@ define void @flat_atomic_min_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
-; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
-; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: s_cbranch_vccz .LBB209_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
-; GFX950-NEXT: flat_atomic_smin_x2 v[2:3], v[2:3], v[0:1] sc0
+; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: s_cbranch_execz .LBB209_3
; GFX950-NEXT: s_branch .LBB209_4
; GFX950-NEXT: .LBB209_2:
-; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB209_3: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
-; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0
+; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_cmp_le_i64_e32 vcc, v[2:3], v[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
-; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0
+; GFX950-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[2:3]
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
; GFX950-NEXT: .LBB209_4: ; %atomicrmw.end
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=a"()
@@ -16038,33 +15935,29 @@ define void @flat_atomic_umax_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_umax_x2 v[2:3], v[2:3], v[0:1] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: s_cbranch_execz .LBB211_3
; GFX90A-NEXT: s_branch .LBB211_4
; GFX90A-NEXT: .LBB211_2:
-; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: .LBB211_3: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: s_waitcnt vmcnt(1)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
-; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB211_4: ; %atomicrmw.end
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_umax_i64_saddr_ret_a_a:
@@ -16078,38 +15971,35 @@ define void @flat_atomic_umax_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
-; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
-; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: s_cbranch_vccz .LBB211_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
-; GFX950-NEXT: flat_atomic_umax_x2 v[2:3], v[2:3], v[0:1] sc0
+; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: s_cbranch_execz .LBB211_3
; GFX950-NEXT: s_branch .LBB211_4
; GFX950-NEXT: .LBB211_2:
-; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB211_3: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
-; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0
+; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
-; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0
+; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
; GFX950-NEXT: .LBB211_4: ; %atomicrmw.end
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=a"()
@@ -16223,33 +16113,29 @@ define void @flat_atomic_umin_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_umin_x2 v[2:3], v[2:3], v[0:1] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: s_cbranch_execz .LBB213_3
; GFX90A-NEXT: s_branch .LBB213_4
; GFX90A-NEXT: .LBB213_2:
-; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: .LBB213_3: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: s_waitcnt vmcnt(1)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_le_u64_e32 vcc, v[2:3], v[0:1]
-; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB213_4: ; %atomicrmw.end
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_umin_i64_saddr_ret_a_a:
@@ -16263,38 +16149,35 @@ define void @flat_atomic_umin_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
-; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
-; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: s_cbranch_vccz .LBB213_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
-; GFX950-NEXT: flat_atomic_umin_x2 v[2:3], v[2:3], v[0:1] sc0
+; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: s_cbranch_execz .LBB213_3
; GFX950-NEXT: s_branch .LBB213_4
; GFX950-NEXT: .LBB213_2:
-; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB213_3: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
-; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0
+; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_cmp_le_u64_e32 vcc, v[2:3], v[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
-; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0
+; GFX950-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[2:3]
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
; GFX950-NEXT: .LBB213_4: ; %atomicrmw.end
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=a"()
@@ -16400,43 +16283,39 @@ define void @flat_atomic_uinc_wrap_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
-; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
-; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: s_cbranch_vccz .LBB215_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
-; GFX90A-NEXT: flat_atomic_inc_x2 v[2:3], v[2:3], v[0:1] glc
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: s_cbranch_execz .LBB215_3
; GFX90A-NEXT: s_branch .LBB215_4
; GFX90A-NEXT: .LBB215_2:
-; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: .LBB215_3: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
-; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: s_waitcnt vmcnt(1)
-; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, 1, v2
+; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v3, vcc
-; GFX90A-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
-; GFX90A-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc
-; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, 1, v0
+; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc
+; GFX90A-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
+; GFX90A-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc
+; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB215_4: ; %atomicrmw.end
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_uinc_wrap_i64_saddr_ret_a_a:
@@ -16450,38 +16329,36 @@ define void @flat_atomic_uinc_wrap_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
-; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
-; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: s_cbranch_vccz .LBB215_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
-; GFX950-NEXT: flat_atomic_inc_x2 v[2:3], v[2:3], v[0:1] sc0
+; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: s_cbranch_execz .LBB215_3
; GFX950-NEXT: s_branch .LBB215_4
; GFX950-NEXT: .LBB215_2:
-; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB215_3: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
-; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0
+; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[2:3], 0, 1
-; GFX950-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
-; GFX950-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc
-; GFX950-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc
-; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0
+; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, 1
+; GFX950-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc
+; GFX950-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
+; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
; GFX950-NEXT: .LBB215_4: ; %atomicrmw.end
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=a"()
@@ -16598,37 +16475,33 @@ define void @flat_atomic_udec_wrap_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_dec_x2 v[2:3], v[2:3], v[0:1] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: s_cbranch_execz .LBB217_3
; GFX90A-NEXT: s_branch .LBB217_4
; GFX90A-NEXT: .LBB217_2:
-; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: .LBB217_3: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: s_waitcnt vmcnt(1)
-; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, -1, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, -1, v2
; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, -1, v3, vcc
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
; GFX90A-NEXT: v_cmp_gt_u64_e64 s[4:5], v[2:3], v[0:1]
; GFX90A-NEXT: s_or_b64 vcc, vcc, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
-; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB217_4: ; %atomicrmw.end
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_udec_wrap_i64_saddr_ret_a_a:
@@ -16642,40 +16515,37 @@ define void @flat_atomic_udec_wrap_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
-; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
-; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: s_cbranch_vccz .LBB217_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
-; GFX950-NEXT: flat_atomic_dec_x2 v[2:3], v[2:3], v[0:1] sc0
+; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: s_cbranch_execz .LBB217_3
; GFX950-NEXT: s_branch .LBB217_4
; GFX950-NEXT: .LBB217_2:
-; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB217_3: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s2, s0, -1
-; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s2
+; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s2
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
-; GFX950-NEXT: v_cmp_gt_u64_e64 s[0:1], v[2:3], v[0:1]
-; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[2:3], 0, -1
+; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GFX950-NEXT: v_cmp_gt_u64_e64 s[0:1], v[0:1], v[2:3]
+; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, -1
; GFX950-NEXT: s_or_b64 vcc, vcc, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX950-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
-; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s2
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s2
; GFX950-NEXT: .LBB217_4: ; %atomicrmw.end
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=a"()
@@ -16806,8 +16676,6 @@ define void @flat_atomic_usub_cond_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
@@ -16816,7 +16684,7 @@ define void @flat_atomic_usub_cond_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_branch .LBB219_6
; GFX90A-NEXT: .LBB219_4:
-; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_cbranch_execz .LBB219_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
@@ -16829,13 +16697,13 @@ define void @flat_atomic_usub_cond_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_subb_co_u32_e32 v6, vcc, v1, v5, vcc
; GFX90A-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX90A-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc
; GFX90A-NEXT: v_cndmask_b32_e32 v4, v1, v6, vcc
-; GFX90A-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB219_6: ; %atomicrmw.phi
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -16874,8 +16742,6 @@ define void @flat_atomic_usub_cond_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
@@ -16884,7 +16750,7 @@ define void @flat_atomic_usub_cond_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_branch .LBB219_6
; GFX950-NEXT: .LBB219_4:
-; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_cbranch_execz .LBB219_6
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
@@ -16895,12 +16761,13 @@ define void @flat_atomic_usub_cond_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v5, vcc
; GFX950-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[4:5]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc
; GFX950-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc
; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
; GFX950-NEXT: .LBB219_6: ; %atomicrmw.phi
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -17071,8 +16938,6 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
@@ -17081,7 +16946,7 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_branch .LBB221_6
; GFX90A-NEXT: .LBB221_4:
-; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_cbranch_execz .LBB221_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
@@ -17094,13 +16959,13 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v5, vcc
; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_cndmask_b32_e64 v0, v3, 0, vcc
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB221_6: ; %atomicrmw.phi
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -17139,8 +17004,6 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
@@ -17149,7 +17012,7 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_branch .LBB221_6
; GFX950-NEXT: .LBB221_4:
-; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_cbranch_execz .LBB221_6
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
@@ -17160,12 +17023,13 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v5, vcc
; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
; GFX950-NEXT: .LBB221_6: ; %atomicrmw.phi
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -17333,38 +17197,36 @@ define void @flat_atomic_fadd_f32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global
; GFX90A-NEXT: v_mov_b32_e32 v1, 0
; GFX90A-NEXT: global_atomic_add_f32 v1, v1, v0, s[4:5] glc
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1
; GFX90A-NEXT: s_cbranch_execz .LBB223_5
; GFX90A-NEXT: s_branch .LBB223_6
; GFX90A-NEXT: .LBB223_3:
-; GFX90A-NEXT: ; implicit-def: $agpr0
+; GFX90A-NEXT: ; implicit-def: $vgpr1
; GFX90A-NEXT: s_branch .LBB223_7
; GFX90A-NEXT: .LBB223_4:
-; GFX90A-NEXT: ; implicit-def: $agpr0
+; GFX90A-NEXT: ; implicit-def: $vgpr1
; GFX90A-NEXT: .LBB223_5: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s6, s4, -1
-; GFX90A-NEXT: v_mov_b32_e32 v1, s6
-; GFX90A-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen
+; GFX90A-NEXT: v_mov_b32_e32 v2, s6
+; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_add_f32_e32 v3, v2, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
+; GFX90A-NEXT: v_add_f32_e32 v3, v1, v0
+; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen
; GFX90A-NEXT: .LBB223_6: ; %Flow1
; GFX90A-NEXT: s_cbranch_execnz .LBB223_8
; GFX90A-NEXT: .LBB223_7: ; %atomicrmw.shared
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v1, s4
-; GFX90A-NEXT: ds_add_rtn_f32 v0, v1, v0
+; GFX90A-NEXT: ds_add_rtn_f32 v1, v1, v0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: .LBB223_8: ; %atomicrmw.end
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fadd_f32_saddr_ret_a_a:
@@ -17485,12 +17347,12 @@ define void @flat_atomic_fsub_f32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB225_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -17514,12 +17376,12 @@ define void @flat_atomic_fsub_f32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB225_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -17597,29 +17459,29 @@ define void @flat_atomic_fmax_f32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
-; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
+; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_max_f32_e32 v4, v0, v0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: .LBB227_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
+; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1
+; GFX90A-NEXT: v_max_f32_e32 v0, v0, v4
+; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB227_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -17629,29 +17491,29 @@ define void @flat_atomic_fmax_f32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
+; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
; GFX950-NEXT: v_max_f32_e32 v4, v0, v0
-; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-NEXT: .LBB227_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX950-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
+; GFX950-NEXT: v_max_f32_e32 v0, v1, v1
+; GFX950-NEXT: v_max_f32_e32 v0, v0, v4
+; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX950-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB227_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -17735,29 +17597,29 @@ define void @flat_atomic_fmin_f32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
-; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
+; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_max_f32_e32 v4, v0, v0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: .LBB229_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4
-; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
+; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1
+; GFX90A-NEXT: v_min_f32_e32 v0, v0, v4
+; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB229_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -17767,29 +17629,29 @@ define void @flat_atomic_fmin_f32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
+; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
; GFX950-NEXT: v_max_f32_e32 v4, v0, v0
-; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-NEXT: .LBB229_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX950-NEXT: v_min_f32_e32 v2, v2, v4
-; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
+; GFX950-NEXT: v_max_f32_e32 v0, v1, v1
+; GFX950-NEXT: v_min_f32_e32 v0, v0, v4
+; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX950-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB229_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -17873,29 +17735,30 @@ define void @flat_atomic_fmaximum_f32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
-; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
+; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: .LBB231_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_max_f32_e32 v2, v3, v4
-; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v3, v4
-; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
-; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
+; GFX90A-NEXT: v_max_f32_e32 v0, v1, v4
+; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v1, v4
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB231_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -17919,12 +17782,12 @@ define void @flat_atomic_fmaximum_f32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB231_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -18005,29 +17868,30 @@ define void @flat_atomic_fminimum_f32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
-; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
+; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: .LBB233_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_min_f32_e32 v2, v3, v4
-; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v3, v4
-; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
-; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
+; GFX90A-NEXT: v_min_f32_e32 v0, v1, v4
+; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v1, v4
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB233_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -18051,12 +17915,12 @@ define void @flat_atomic_fminimum_f32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB233_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -18161,27 +18025,21 @@ define void @flat_atomic_fadd_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: global_atomic_add_f64 v[2:3], v2, v[0:1], s[4:5] glc
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: s_cbranch_execz .LBB235_5
; GFX90A-NEXT: s_branch .LBB235_6
; GFX90A-NEXT: .LBB235_3:
-; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: s_branch .LBB235_7
; GFX90A-NEXT: .LBB235_4:
-; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: .LBB235_5: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s6, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v6, s6
; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: s_waitcnt vmcnt(1)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_add_f64 v[4:5], v[2:3], v[0:1]
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB235_6: ; %Flow1
@@ -18189,16 +18047,17 @@ define void @flat_atomic_fadd_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: .LBB235_7: ; %atomicrmw.shared
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v2, s4
-; GFX90A-NEXT: ds_add_rtn_f64 v[0:1], v2, v[0:1]
+; GFX90A-NEXT: ds_add_rtn_f64 v[2:3], v2, v[0:1]
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: .LBB235_8: ; %atomicrmw.end
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fadd_f64_saddr_ret_a_a:
@@ -18225,40 +18084,36 @@ define void @flat_atomic_fadd_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.global
; GFX950-NEXT: v_mov_b32_e32 v2, 0
; GFX950-NEXT: global_atomic_add_f64 v[2:3], v2, v[0:1], s[0:1] sc0
-; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: s_cbranch_execz .LBB235_5
; GFX950-NEXT: s_branch .LBB235_6
; GFX950-NEXT: .LBB235_3:
-; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: s_branch .LBB235_7
; GFX950-NEXT: .LBB235_4:
-; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: .LBB235_5: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s2, s0, -1
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s2
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_add_f64 v[4:5], v[2:3], v[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: scratch_store_dwordx2 off, v[4:5], s2
; GFX950-NEXT: .LBB235_6: ; %Flow1
; GFX950-NEXT: s_cbranch_execnz .LBB235_8
; GFX950-NEXT: .LBB235_7: ; %atomicrmw.shared
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
+; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v2, s0
-; GFX950-NEXT: ds_add_rtn_f64 v[0:1], v2, v[0:1]
+; GFX950-NEXT: ds_add_rtn_f64 v[2:3], v2, v[0:1]
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: .LBB235_8: ; %atomicrmw.end
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10
%data = call double asm "; def $0", "=a"()
@@ -18408,9 +18263,7 @@ define void @flat_atomic_fsub_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB237_2
@@ -18418,7 +18271,7 @@ define void @flat_atomic_fsub_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_branch .LBB237_6
; GFX90A-NEXT: .LBB237_4:
-; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_cbranch_execz .LBB237_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
@@ -18426,14 +18279,13 @@ define void @flat_atomic_fsub_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: v_mov_b32_e32 v6, s4
; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: s_waitcnt vmcnt(1)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], -v[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB237_6: ; %atomicrmw.phi
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -18466,9 +18318,7 @@ define void @flat_atomic_fsub_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB237_2
@@ -18476,18 +18326,18 @@ define void @flat_atomic_fsub_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_branch .LBB237_6
; GFX950-NEXT: .LBB237_4:
-; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_cbranch_execz .LBB237_6
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_add_f64 v[2:3], v[0:1], -v[4:5]
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
; GFX950-NEXT: .LBB237_6: ; %atomicrmw.phi
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -18618,41 +18468,37 @@ define void @flat_atomic_fmax_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
-; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
-; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: s_cbranch_vccz .LBB239_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
-; GFX90A-NEXT: flat_atomic_max_f64 v[2:3], v[2:3], v[0:1] glc
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: s_cbranch_execz .LBB239_3
; GFX90A-NEXT: s_branch .LBB239_4
; GFX90A-NEXT: .LBB239_2:
-; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: .LBB239_3: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v6, s4
-; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX90A-NEXT: s_waitcnt vmcnt(1)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX90A-NEXT: v_max_f64 v[0:1], v[4:5], v[0:1]
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
-; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3]
+; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB239_4: ; %atomicrmw.end
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmax_f64_saddr_ret_a_a:
@@ -18674,13 +18520,10 @@ define void @flat_atomic_fmax_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-NEXT: flat_atomic_max_f64 v[2:3], v[2:3], v[0:1] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: s_cbranch_execz .LBB239_3
; GFX950-NEXT: s_branch .LBB239_4
; GFX950-NEXT: .LBB239_2:
-; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: .LBB239_3: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
@@ -18688,15 +18531,15 @@ define void @flat_atomic_fmax_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_max_f64 v[0:1], v[4:5], v[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0
; GFX950-NEXT: .LBB239_4: ; %atomicrmw.end
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10
%data = call double asm "; def $0", "=a"()
@@ -18801,41 +18644,37 @@ define void @flat_atomic_fmin_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
-; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
-; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: s_cbranch_vccz .LBB241_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
-; GFX90A-NEXT: flat_atomic_min_f64 v[2:3], v[2:3], v[0:1] glc
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: s_cbranch_execz .LBB241_3
; GFX90A-NEXT: s_branch .LBB241_4
; GFX90A-NEXT: .LBB241_2:
-; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: .LBB241_3: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v6, s4
-; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX90A-NEXT: s_waitcnt vmcnt(1)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX90A-NEXT: v_min_f64 v[0:1], v[4:5], v[0:1]
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
-; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX90A-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3]
+; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB241_4: ; %atomicrmw.end
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmin_f64_saddr_ret_a_a:
@@ -18857,13 +18696,10 @@ define void @flat_atomic_fmin_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-NEXT: flat_atomic_min_f64 v[2:3], v[2:3], v[0:1] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: s_cbranch_execz .LBB241_3
; GFX950-NEXT: s_branch .LBB241_4
; GFX950-NEXT: .LBB241_2:
-; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: .LBB241_3: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
@@ -18871,15 +18707,15 @@ define void @flat_atomic_fmin_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_min_f64 v[0:1], v[4:5], v[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0
; GFX950-NEXT: .LBB241_4: ; %atomicrmw.end
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10
%data = call double asm "; def $0", "=a"()
@@ -19003,8 +18839,6 @@ define void @flat_atomic_fmaximum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
@@ -19013,7 +18847,7 @@ define void @flat_atomic_fmaximum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_branch .LBB243_6
; GFX90A-NEXT: .LBB243_4:
-; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_cbranch_execz .LBB243_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
@@ -19022,17 +18856,16 @@ define void @flat_atomic_fmaximum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000
-; GFX90A-NEXT: s_waitcnt vmcnt(1)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5]
; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
-; GFX90A-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc
+; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB243_6: ; %atomicrmw.phi
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -19070,8 +18903,6 @@ define void @flat_atomic_fmaximum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
@@ -19080,7 +18911,7 @@ define void @flat_atomic_fmaximum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_branch .LBB243_6
; GFX950-NEXT: .LBB243_4:
-; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_cbranch_execz .LBB243_6
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
@@ -19090,12 +18921,13 @@ define void @flat_atomic_fmaximum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5]
; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
; GFX950-NEXT: .LBB243_6: ; %atomicrmw.phi
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -19263,8 +19095,6 @@ define void @flat_atomic_fminimum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
@@ -19273,7 +19103,7 @@ define void @flat_atomic_fminimum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_branch .LBB245_6
; GFX90A-NEXT: .LBB245_4:
-; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_cbranch_execz .LBB245_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
@@ -19282,17 +19112,16 @@ define void @flat_atomic_fminimum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000
-; GFX90A-NEXT: s_waitcnt vmcnt(1)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5]
; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
-; GFX90A-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc
+; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB245_6: ; %atomicrmw.phi
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -19330,8 +19159,6 @@ define void @flat_atomic_fminimum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
@@ -19340,7 +19167,7 @@ define void @flat_atomic_fminimum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_branch .LBB245_6
; GFX950-NEXT: .LBB245_4:
-; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_cbranch_execz .LBB245_6
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
@@ -19350,12 +19177,13 @@ define void @flat_atomic_fminimum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5]
; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
; GFX950-NEXT: .LBB245_6: ; %atomicrmw.phi
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -19515,12 +19343,12 @@ define void @flat_atomic_fadd_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB247_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -19618,12 +19446,12 @@ define void @flat_atomic_fsub_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB249_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -19647,12 +19475,12 @@ define void @flat_atomic_fsub_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB249_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -19730,29 +19558,29 @@ define void @flat_atomic_fmax_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
-; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
+; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_pk_max_f16 v4, v0, v0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: .LBB251_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3
-; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4
-; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
+; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1
+; GFX90A-NEXT: v_pk_max_f16 v0, v0, v4
+; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB251_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -19762,30 +19590,30 @@ define void @flat_atomic_fmax_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
+; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
; GFX950-NEXT: v_pk_max_f16 v4, v0, v0
-; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-NEXT: .LBB251_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX950-NEXT: v_pk_max_f16 v0, v1, v1
; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_pk_max_f16 v2, v2, v4
-; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
+; GFX950-NEXT: v_pk_max_f16 v0, v0, v4
+; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX950-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB251_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -19870,29 +19698,29 @@ define void @flat_atomic_fmin_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
-; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
+; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_pk_max_f16 v4, v0, v0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: .LBB253_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3
-; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4
-; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
+; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1
+; GFX90A-NEXT: v_pk_min_f16 v0, v0, v4
+; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB253_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -19902,30 +19730,30 @@ define void @flat_atomic_fmin_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
+; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
; GFX950-NEXT: v_pk_max_f16 v4, v0, v0
-; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-NEXT: .LBB253_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX950-NEXT: v_pk_max_f16 v0, v1, v1
; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_pk_min_f16 v2, v2, v4
-; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
+; GFX950-NEXT: v_pk_min_f16 v0, v0, v4
+; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX950-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB253_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -20031,13 +19859,13 @@ define void @flat_atomic_fmaximum_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB255_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -20061,12 +19889,12 @@ define void @flat_atomic_fmaximum_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB255_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -20172,13 +20000,13 @@ define void @flat_atomic_fminimum_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB257_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -20202,12 +20030,12 @@ define void @flat_atomic_fminimum_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB257_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -20328,13 +20156,13 @@ define void @flat_atomic_fadd_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB259_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -20469,13 +20297,13 @@ define void @flat_atomic_fsub_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB261_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -20505,13 +20333,13 @@ define void @flat_atomic_fsub_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB261_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -20647,13 +20475,13 @@ define void @flat_atomic_fmax_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB263_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -20683,13 +20511,13 @@ define void @flat_atomic_fmax_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB263_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -20825,13 +20653,13 @@ define void @flat_atomic_fmin_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB265_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -20861,13 +20689,13 @@ define void @flat_atomic_fmin_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB265_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -21008,13 +20836,13 @@ define void @flat_atomic_fmaximum_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB267_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -21044,13 +20872,13 @@ define void @flat_atomic_fmaximum_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB267_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -21196,13 +21024,13 @@ define void @flat_atomic_fminimum_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB269_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -21232,13 +21060,13 @@ define void @flat_atomic_fminimum_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB269_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/a-v-global-atomic-cmpxchg.ll b/llvm/test/CodeGen/AMDGPU/a-v-global-atomic-cmpxchg.ll
index 063feec759efa..37a44d8b4b7d1 100644
--- a/llvm/test/CodeGen/AMDGPU/a-v-global-atomic-cmpxchg.ll
+++ b/llvm/test/CodeGen/AMDGPU/a-v-global-atomic-cmpxchg.ll
@@ -449,13 +449,13 @@ define void @global_atomic_cmpxchg_i64_ret_a_a__a(ptr addrspace(1) %ptr) #0 {
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def a[0:1]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_accvgpr_read_b32 v5, a1
-; CHECK-NEXT: v_accvgpr_read_b32 v4, a0
; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; def a[0:1]
+; CHECK-NEXT: ; def a[2:3]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_accvgpr_read_b32 v3, a1
-; CHECK-NEXT: v_accvgpr_read_b32 v2, a0
+; CHECK-NEXT: v_accvgpr_read_b32 v2, a2
+; CHECK-NEXT: v_accvgpr_read_b32 v3, a3
+; CHECK-NEXT: v_accvgpr_read_b32 v4, a0
+; CHECK-NEXT: v_accvgpr_read_b32 v5, a1
; CHECK-NEXT: buffer_wbl2
; CHECK-NEXT: global_atomic_cmpswap_x2 v[0:1], v[0:1], v[2:5], off offset:80 glc
; CHECK-NEXT: s_waitcnt vmcnt(0)
@@ -483,13 +483,13 @@ define void @global_atomic_cmpxchg_i64_ret_a_a__v(ptr addrspace(1) %ptr) #0 {
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def a[0:1]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_accvgpr_read_b32 v5, a1
-; CHECK-NEXT: v_accvgpr_read_b32 v4, a0
; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; def a[0:1]
+; CHECK-NEXT: ; def a[2:3]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_accvgpr_read_b32 v3, a1
-; CHECK-NEXT: v_accvgpr_read_b32 v2, a0
+; CHECK-NEXT: v_accvgpr_read_b32 v2, a2
+; CHECK-NEXT: v_accvgpr_read_b32 v3, a3
+; CHECK-NEXT: v_accvgpr_read_b32 v4, a0
+; CHECK-NEXT: v_accvgpr_read_b32 v5, a1
; CHECK-NEXT: buffer_wbl2
; CHECK-NEXT: global_atomic_cmpswap_x2 v[0:1], v[0:1], v[2:5], off offset:80 glc
; CHECK-NEXT: s_waitcnt vmcnt(0)
@@ -515,8 +515,8 @@ define void @global_atomic_cmpxchg_i64_ret_v_a__v(ptr addrspace(1) %ptr) #0 {
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def a[0:1]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_accvgpr_read_b32 v3, a1
; CHECK-NEXT: v_accvgpr_read_b32 v2, a0
+; CHECK-NEXT: v_accvgpr_read_b32 v3, a1
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def v[4:5]
; CHECK-NEXT: ;;#ASMEND
@@ -545,8 +545,8 @@ define void @global_atomic_cmpxchg_i64_ret_a_v__v(ptr addrspace(1) %ptr) #0 {
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def a[0:1]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_accvgpr_read_b32 v5, a1
; CHECK-NEXT: v_accvgpr_read_b32 v4, a0
+; CHECK-NEXT: v_accvgpr_read_b32 v5, a1
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def v[2:3]
; CHECK-NEXT: ;;#ASMEND
@@ -661,8 +661,8 @@ define void @global_atomic_cmpxchg_i64_ret_av_a__av(ptr addrspace(1) %ptr) #0 {
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def a[0:1]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_accvgpr_read_b32 v3, a1
; CHECK-NEXT: v_accvgpr_read_b32 v2, a0
+; CHECK-NEXT: v_accvgpr_read_b32 v3, a1
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def v[4:5]
; CHECK-NEXT: ;;#ASMEND
@@ -691,8 +691,8 @@ define void @global_atomic_cmpxchg_i64_ret_a_av__av(ptr addrspace(1) %ptr) #0 {
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def a[0:1]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_accvgpr_read_b32 v5, a1
; CHECK-NEXT: v_accvgpr_read_b32 v4, a0
+; CHECK-NEXT: v_accvgpr_read_b32 v5, a1
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def v[2:3]
; CHECK-NEXT: ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll
index c98fff96d7b8a..c54421ae64528 100644
--- a/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll
+++ b/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll
@@ -338,225 +338,264 @@ define void @global_atomic_xchg_i32_ret_av_av_no_agprs(ptr addrspace(1) %ptr) #0
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a6, v43 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a7, v44 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a8, v45 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a9, v46 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a10, v47 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a11, v56 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a12, v57 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a13, v58 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a14, v59 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a15, v60 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse
+; GFX90A-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword a32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword a33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword a34, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:31]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def a2
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_write_b32 a19, v31 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a33, v31
+; GFX90A-NEXT: v_accvgpr_write_b32 a32, v30
+; GFX90A-NEXT: v_accvgpr_write_b32 a31, v29
+; GFX90A-NEXT: v_accvgpr_write_b32 a30, v28
+; GFX90A-NEXT: v_accvgpr_write_b32 a29, v27
+; GFX90A-NEXT: v_accvgpr_write_b32 a28, v26
+; GFX90A-NEXT: v_accvgpr_write_b32 a27, v25
+; GFX90A-NEXT: v_accvgpr_write_b32 a26, v24
+; GFX90A-NEXT: v_accvgpr_write_b32 a25, v23
+; GFX90A-NEXT: v_accvgpr_write_b32 a24, v22
+; GFX90A-NEXT: v_accvgpr_write_b32 a23, v21
+; GFX90A-NEXT: v_accvgpr_write_b32 a22, v20
+; GFX90A-NEXT: v_accvgpr_write_b32 a21, v19
+; GFX90A-NEXT: v_accvgpr_write_b32 a20, v18
+; GFX90A-NEXT: v_accvgpr_write_b32 a19, v17
+; GFX90A-NEXT: v_accvgpr_write_b32 a18, v16
+; GFX90A-NEXT: v_accvgpr_write_b32 a17, v15
+; GFX90A-NEXT: v_accvgpr_write_b32 a16, v14
+; GFX90A-NEXT: v_accvgpr_write_b32 a15, v13
+; GFX90A-NEXT: v_accvgpr_write_b32 a14, v12
+; GFX90A-NEXT: v_accvgpr_write_b32 a13, v11
+; GFX90A-NEXT: v_accvgpr_write_b32 a12, v10
+; GFX90A-NEXT: v_accvgpr_write_b32 a11, v9
+; GFX90A-NEXT: v_accvgpr_write_b32 a10, v8
+; GFX90A-NEXT: v_accvgpr_write_b32 a9, v7
+; GFX90A-NEXT: v_accvgpr_write_b32 a8, v6
+; GFX90A-NEXT: v_accvgpr_write_b32 a7, v5
+; GFX90A-NEXT: v_accvgpr_write_b32 a6, v4
+; GFX90A-NEXT: v_accvgpr_write_b32 a5, v3
+; GFX90A-NEXT: v_accvgpr_write_b32 a4, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a3, v1
+; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def a34
+; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
-; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX90A-NEXT: v_accvgpr_read_b32 v2, a34
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: global_atomic_swap v0, v[0:1], v2, off offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
-; GFX90A-NEXT: v_accvgpr_write_b32 a31, v19 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a30, v20 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a29, v21 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a28, v22 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a27, v23 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a26, v24 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a25, v25 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a24, v26 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a23, v27 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; GFX90A-NEXT: v_accvgpr_read_b32 v19, a31 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v20, a30 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v21, a29 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v22, a28 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v23, a27 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v24, a26 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v25, a25 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v26, a24 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v27, a23 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v28, a22 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v29, a21 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v30, a20 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v0, a2
+; GFX90A-NEXT: v_accvgpr_read_b32 v1, a3
+; GFX90A-NEXT: v_accvgpr_read_b32 v2, a4
+; GFX90A-NEXT: v_accvgpr_read_b32 v3, a5
+; GFX90A-NEXT: v_accvgpr_read_b32 v4, a6
+; GFX90A-NEXT: v_accvgpr_read_b32 v5, a7
+; GFX90A-NEXT: v_accvgpr_read_b32 v6, a8
+; GFX90A-NEXT: v_accvgpr_read_b32 v7, a9
+; GFX90A-NEXT: v_accvgpr_read_b32 v8, a10
+; GFX90A-NEXT: v_accvgpr_read_b32 v9, a11
+; GFX90A-NEXT: v_accvgpr_read_b32 v10, a12
+; GFX90A-NEXT: v_accvgpr_read_b32 v11, a13
+; GFX90A-NEXT: v_accvgpr_read_b32 v12, a14
+; GFX90A-NEXT: v_accvgpr_read_b32 v13, a15
+; GFX90A-NEXT: v_accvgpr_read_b32 v14, a16
+; GFX90A-NEXT: v_accvgpr_read_b32 v15, a17
+; GFX90A-NEXT: v_accvgpr_read_b32 v16, a18
+; GFX90A-NEXT: v_accvgpr_read_b32 v17, a19
+; GFX90A-NEXT: v_accvgpr_read_b32 v18, a20
+; GFX90A-NEXT: v_accvgpr_read_b32 v19, a21
+; GFX90A-NEXT: v_accvgpr_read_b32 v20, a22
+; GFX90A-NEXT: v_accvgpr_read_b32 v21, a23
+; GFX90A-NEXT: v_accvgpr_read_b32 v22, a24
+; GFX90A-NEXT: v_accvgpr_read_b32 v23, a25
+; GFX90A-NEXT: v_accvgpr_read_b32 v24, a26
+; GFX90A-NEXT: v_accvgpr_read_b32 v25, a27
+; GFX90A-NEXT: v_accvgpr_read_b32 v26, a28
+; GFX90A-NEXT: v_accvgpr_read_b32 v27, a29
+; GFX90A-NEXT: v_accvgpr_read_b32 v28, a30
+; GFX90A-NEXT: v_accvgpr_read_b32 v29, a31
+; GFX90A-NEXT: v_accvgpr_read_b32 v30, a32
+; GFX90A-NEXT: v_accvgpr_read_b32 v31, a33
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use a0
+; GFX90A-NEXT: ; use v[0:31]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_accvgpr_read_b32 v31, a19 ; Reload Reuse
+; GFX90A-NEXT: buffer_load_dword a34, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword a33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword a32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v[0:31]
+; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v63, a18 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v62, a17 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v61, a16 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v60, a15 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v59, a14 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v58, a13 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v57, a12 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v56, a11 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v47, a10 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v46, a9 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v45, a8 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v44, a7 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v43, a6 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v42, a5 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v41, a4 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v40, a3 ; Reload Reuse
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: global_atomic_xchg_i32_ret_av_av_no_agprs:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a6, v43 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a7, v44 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a8, v45 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a9, v46 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a10, v47 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a11, v56 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a12, v57 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a13, v58 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a14, v59 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a15, v60 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse
+; GFX950-NEXT: scratch_store_dword off, v40, s32 offset:72 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v41, s32 offset:68 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v42, s32 offset:64 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v43, s32 offset:60 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v44, s32 offset:56 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v45, s32 offset:52 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v46, s32 offset:48 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v47, s32 offset:44 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v56, s32 offset:40 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v57, s32 offset:36 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v58, s32 offset:32 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v59, s32 offset:28 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v60, s32 offset:24 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v61, s32 offset:20 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v62, s32 offset:16 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v63, s32 offset:12 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, a32, s32 offset:8 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, a33, s32 offset:4 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, a34, s32 ; 4-byte Folded Spill
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[0:31]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: scratch_store_dwordx4 off, v[0:3], s32 ; 16-byte Folded Spill
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: scratch_store_dwordx4 off, v[4:7], s32 offset:16 ; 16-byte Folded Spill
-; GFX950-NEXT: scratch_store_dwordx4 off, v[8:11], s32 offset:32 ; 16-byte Folded Spill
-; GFX950-NEXT: scratch_store_dwordx4 off, v[12:15], s32 offset:48 ; 16-byte Folded Spill
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def a2
+; GFX950-NEXT: ; def a34
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: v_accvgpr_write_b32 a19, v31 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_accvgpr_write_b32 a33, v31
+; GFX950-NEXT: v_accvgpr_write_b32 a32, v30
+; GFX950-NEXT: v_accvgpr_write_b32 a31, v29
+; GFX950-NEXT: v_accvgpr_write_b32 a30, v28
+; GFX950-NEXT: v_accvgpr_write_b32 a29, v27
+; GFX950-NEXT: v_accvgpr_write_b32 a28, v26
+; GFX950-NEXT: v_accvgpr_write_b32 a27, v25
+; GFX950-NEXT: v_accvgpr_write_b32 a26, v24
+; GFX950-NEXT: v_accvgpr_write_b32 a25, v23
+; GFX950-NEXT: v_accvgpr_write_b32 a24, v22
+; GFX950-NEXT: v_accvgpr_write_b32 a23, v21
+; GFX950-NEXT: v_accvgpr_write_b32 a22, v20
+; GFX950-NEXT: v_accvgpr_write_b32 a21, v19
+; GFX950-NEXT: v_accvgpr_write_b32 a20, v18
+; GFX950-NEXT: v_accvgpr_write_b32 a19, v17
+; GFX950-NEXT: v_accvgpr_write_b32 a18, v16
+; GFX950-NEXT: v_accvgpr_write_b32 a17, v15
+; GFX950-NEXT: v_accvgpr_write_b32 a16, v14
+; GFX950-NEXT: v_accvgpr_write_b32 a15, v13
+; GFX950-NEXT: v_accvgpr_write_b32 a14, v12
+; GFX950-NEXT: v_accvgpr_write_b32 a13, v11
+; GFX950-NEXT: v_accvgpr_write_b32 a12, v10
+; GFX950-NEXT: v_accvgpr_write_b32 a11, v9
+; GFX950-NEXT: v_accvgpr_write_b32 a10, v8
+; GFX950-NEXT: v_accvgpr_write_b32 a9, v7
+; GFX950-NEXT: v_accvgpr_write_b32 a8, v6
+; GFX950-NEXT: v_accvgpr_write_b32 a7, v5
+; GFX950-NEXT: v_accvgpr_write_b32 a6, v4
+; GFX950-NEXT: v_accvgpr_write_b32 a5, v3
+; GFX950-NEXT: v_accvgpr_write_b32 a4, v2
+; GFX950-NEXT: v_accvgpr_write_b32 a3, v1
+; GFX950-NEXT: v_accvgpr_write_b32 a2, v0
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
-; GFX950-NEXT: scratch_store_dwordx3 off, v[16:18], s32 offset:64 ; 12-byte Folded Spill
; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
-; GFX950-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX950-NEXT: v_accvgpr_read_b32 v2, a34
; GFX950-NEXT: buffer_wbl2 sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: global_atomic_swap v0, v[0:1], v2, off offset:40 sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
-; GFX950-NEXT: v_accvgpr_write_b32 a31, v19 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a27, v23 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a28, v22 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a29, v21 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a30, v20 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a23, v27 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a24, v26 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a25, v25 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a26, v24 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: scratch_load_dwordx4 v[0:3], off, s32 ; 16-byte Folded Reload
-; GFX950-NEXT: scratch_load_dwordx4 v[4:7], off, s32 offset:16 ; 16-byte Folded Reload
-; GFX950-NEXT: scratch_load_dwordx4 v[8:11], off, s32 offset:32 ; 16-byte Folded Reload
-; GFX950-NEXT: scratch_load_dwordx4 v[12:15], off, s32 offset:48 ; 16-byte Folded Reload
-; GFX950-NEXT: v_accvgpr_read_b32 v19, a31 ; Reload Reuse
-; GFX950-NEXT: scratch_load_dwordx3 v[16:18], off, s32 offset:64 ; 12-byte Folded Reload
-; GFX950-NEXT: v_accvgpr_read_b32 v23, a27 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v22, a28 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v21, a29 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v20, a30 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v27, a23 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v26, a24 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v25, a25 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v24, a26 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v0, a2
+; GFX950-NEXT: v_accvgpr_read_b32 v1, a3
+; GFX950-NEXT: v_accvgpr_read_b32 v2, a4
+; GFX950-NEXT: v_accvgpr_read_b32 v3, a5
+; GFX950-NEXT: v_accvgpr_read_b32 v4, a6
+; GFX950-NEXT: v_accvgpr_read_b32 v5, a7
+; GFX950-NEXT: v_accvgpr_read_b32 v6, a8
+; GFX950-NEXT: v_accvgpr_read_b32 v7, a9
+; GFX950-NEXT: v_accvgpr_read_b32 v8, a10
+; GFX950-NEXT: v_accvgpr_read_b32 v9, a11
+; GFX950-NEXT: v_accvgpr_read_b32 v10, a12
+; GFX950-NEXT: v_accvgpr_read_b32 v11, a13
+; GFX950-NEXT: v_accvgpr_read_b32 v12, a14
+; GFX950-NEXT: v_accvgpr_read_b32 v13, a15
+; GFX950-NEXT: v_accvgpr_read_b32 v14, a16
+; GFX950-NEXT: v_accvgpr_read_b32 v15, a17
+; GFX950-NEXT: v_accvgpr_read_b32 v16, a18
+; GFX950-NEXT: v_accvgpr_read_b32 v17, a19
+; GFX950-NEXT: v_accvgpr_read_b32 v18, a20
+; GFX950-NEXT: v_accvgpr_read_b32 v19, a21
+; GFX950-NEXT: v_accvgpr_read_b32 v20, a22
+; GFX950-NEXT: v_accvgpr_read_b32 v21, a23
+; GFX950-NEXT: v_accvgpr_read_b32 v22, a24
+; GFX950-NEXT: v_accvgpr_read_b32 v23, a25
+; GFX950-NEXT: v_accvgpr_read_b32 v24, a26
+; GFX950-NEXT: v_accvgpr_read_b32 v25, a27
+; GFX950-NEXT: v_accvgpr_read_b32 v26, a28
+; GFX950-NEXT: v_accvgpr_read_b32 v27, a29
+; GFX950-NEXT: v_accvgpr_read_b32 v28, a30
+; GFX950-NEXT: v_accvgpr_read_b32 v29, a31
+; GFX950-NEXT: v_accvgpr_read_b32 v30, a32
+; GFX950-NEXT: v_accvgpr_read_b32 v31, a33
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use a0
+; GFX950-NEXT: ; use v[0:31]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_accvgpr_read_b32 v31, a19 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v30, a20 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v29, a21 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v28, a22 ; Reload Reuse
+; GFX950-NEXT: scratch_load_dword a34, off, s32 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword a33, off, s32 offset:4 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword a32, off, s32 offset:8 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v63, off, s32 offset:12 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v62, off, s32 offset:16 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v61, off, s32 offset:20 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v60, off, s32 offset:24 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v59, off, s32 offset:28 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v58, off, s32 offset:32 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v57, off, s32 offset:36 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v56, off, s32 offset:40 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v47, off, s32 offset:44 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v46, off, s32 offset:48 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v45, off, s32 offset:52 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v44, off, s32 offset:56 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v43, off, s32 offset:60 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v42, off, s32 offset:64 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v41, off, s32 offset:68 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v40, off, s32 offset:72 ; 4-byte Folded Reload
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v[0:31]
+; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: v_accvgpr_read_b32 v63, a18 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v62, a17 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v61, a16 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v60, a15 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v59, a14 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v58, a13 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v57, a12 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v56, a11 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v47, a10 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v46, a9 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v45, a8 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v44, a7 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v43, a6 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v42, a5 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v41, a4 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v40, a3 ; Reload Reuse
+; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10
%data = call i32 asm "; def $0", "=^VA"()
@@ -1062,12 +1101,12 @@ define void @global_atomic_xor_expansion_i32_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB21_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -1092,12 +1131,12 @@ define void @global_atomic_xor_expansion_i32_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB21_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -1197,12 +1236,12 @@ define void @global_atomic_xor_expansion_i32_ret_v_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB23_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -1226,12 +1265,12 @@ define void @global_atomic_xor_expansion_i32_ret_v_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB23_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -1394,12 +1433,12 @@ define void @global_atomic_xor_expansion_i32_ret_av_a(ptr addrspace(1) %ptr) #0
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB26_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -1423,12 +1462,12 @@ define void @global_atomic_xor_expansion_i32_ret_av_a(ptr addrspace(1) %ptr) #0
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB26_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -2007,14 +2046,14 @@ define void @global_atomic_xor_expansion_i64_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB32_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -2040,14 +2079,14 @@ define void @global_atomic_xor_expansion_i64_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB32_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -2151,14 +2190,14 @@ define void @global_atomic_xor_expansion_i64_ret_v_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB34_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -2182,14 +2221,14 @@ define void @global_atomic_xor_expansion_i64_ret_v_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB34_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -2356,14 +2395,14 @@ define void @global_atomic_xor_expansion_i64_ret_av_a(ptr addrspace(1) %ptr) #0
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB37_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -2387,14 +2426,14 @@ define void @global_atomic_xor_expansion_i64_ret_av_a(ptr addrspace(1) %ptr) #0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB37_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -2986,223 +3025,262 @@ define void @global_atomic_xor_i32_ret_av_av_no_agprs(ptr addrspace(1) %ptr) #0
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a6, v43 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a7, v44 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a8, v45 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a9, v46 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a10, v47 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a11, v56 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a12, v57 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a13, v58 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a14, v59 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a15, v60 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse
+; GFX90A-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword a32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword a33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword a34, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:31]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def a2
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_write_b32 a19, v31 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a33, v31
+; GFX90A-NEXT: v_accvgpr_write_b32 a32, v30
+; GFX90A-NEXT: v_accvgpr_write_b32 a31, v29
+; GFX90A-NEXT: v_accvgpr_write_b32 a30, v28
+; GFX90A-NEXT: v_accvgpr_write_b32 a29, v27
+; GFX90A-NEXT: v_accvgpr_write_b32 a28, v26
+; GFX90A-NEXT: v_accvgpr_write_b32 a27, v25
+; GFX90A-NEXT: v_accvgpr_write_b32 a26, v24
+; GFX90A-NEXT: v_accvgpr_write_b32 a25, v23
+; GFX90A-NEXT: v_accvgpr_write_b32 a24, v22
+; GFX90A-NEXT: v_accvgpr_write_b32 a23, v21
+; GFX90A-NEXT: v_accvgpr_write_b32 a22, v20
+; GFX90A-NEXT: v_accvgpr_write_b32 a21, v19
+; GFX90A-NEXT: v_accvgpr_write_b32 a20, v18
+; GFX90A-NEXT: v_accvgpr_write_b32 a19, v17
+; GFX90A-NEXT: v_accvgpr_write_b32 a18, v16
+; GFX90A-NEXT: v_accvgpr_write_b32 a17, v15
+; GFX90A-NEXT: v_accvgpr_write_b32 a16, v14
+; GFX90A-NEXT: v_accvgpr_write_b32 a15, v13
+; GFX90A-NEXT: v_accvgpr_write_b32 a14, v12
+; GFX90A-NEXT: v_accvgpr_write_b32 a13, v11
+; GFX90A-NEXT: v_accvgpr_write_b32 a12, v10
+; GFX90A-NEXT: v_accvgpr_write_b32 a11, v9
+; GFX90A-NEXT: v_accvgpr_write_b32 a10, v8
+; GFX90A-NEXT: v_accvgpr_write_b32 a9, v7
+; GFX90A-NEXT: v_accvgpr_write_b32 a8, v6
+; GFX90A-NEXT: v_accvgpr_write_b32 a7, v5
+; GFX90A-NEXT: v_accvgpr_write_b32 a6, v4
+; GFX90A-NEXT: v_accvgpr_write_b32 a5, v3
+; GFX90A-NEXT: v_accvgpr_write_b32 a4, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a3, v1
+; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def a34
+; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
-; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX90A-NEXT: v_accvgpr_read_b32 v2, a34
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: global_atomic_xor v0, v[0:1], v2, off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
-; GFX90A-NEXT: v_accvgpr_write_b32 a31, v19 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a30, v20 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a29, v21 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a28, v22 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a27, v23 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a26, v24 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a25, v25 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a24, v26 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a23, v27 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; GFX90A-NEXT: v_accvgpr_read_b32 v19, a31 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v20, a30 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v21, a29 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v22, a28 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v23, a27 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v24, a26 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v25, a25 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v26, a24 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v27, a23 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v28, a22 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v29, a21 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v30, a20 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v0, a2
+; GFX90A-NEXT: v_accvgpr_read_b32 v1, a3
+; GFX90A-NEXT: v_accvgpr_read_b32 v2, a4
+; GFX90A-NEXT: v_accvgpr_read_b32 v3, a5
+; GFX90A-NEXT: v_accvgpr_read_b32 v4, a6
+; GFX90A-NEXT: v_accvgpr_read_b32 v5, a7
+; GFX90A-NEXT: v_accvgpr_read_b32 v6, a8
+; GFX90A-NEXT: v_accvgpr_read_b32 v7, a9
+; GFX90A-NEXT: v_accvgpr_read_b32 v8, a10
+; GFX90A-NEXT: v_accvgpr_read_b32 v9, a11
+; GFX90A-NEXT: v_accvgpr_read_b32 v10, a12
+; GFX90A-NEXT: v_accvgpr_read_b32 v11, a13
+; GFX90A-NEXT: v_accvgpr_read_b32 v12, a14
+; GFX90A-NEXT: v_accvgpr_read_b32 v13, a15
+; GFX90A-NEXT: v_accvgpr_read_b32 v14, a16
+; GFX90A-NEXT: v_accvgpr_read_b32 v15, a17
+; GFX90A-NEXT: v_accvgpr_read_b32 v16, a18
+; GFX90A-NEXT: v_accvgpr_read_b32 v17, a19
+; GFX90A-NEXT: v_accvgpr_read_b32 v18, a20
+; GFX90A-NEXT: v_accvgpr_read_b32 v19, a21
+; GFX90A-NEXT: v_accvgpr_read_b32 v20, a22
+; GFX90A-NEXT: v_accvgpr_read_b32 v21, a23
+; GFX90A-NEXT: v_accvgpr_read_b32 v22, a24
+; GFX90A-NEXT: v_accvgpr_read_b32 v23, a25
+; GFX90A-NEXT: v_accvgpr_read_b32 v24, a26
+; GFX90A-NEXT: v_accvgpr_read_b32 v25, a27
+; GFX90A-NEXT: v_accvgpr_read_b32 v26, a28
+; GFX90A-NEXT: v_accvgpr_read_b32 v27, a29
+; GFX90A-NEXT: v_accvgpr_read_b32 v28, a30
+; GFX90A-NEXT: v_accvgpr_read_b32 v29, a31
+; GFX90A-NEXT: v_accvgpr_read_b32 v30, a32
+; GFX90A-NEXT: v_accvgpr_read_b32 v31, a33
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use a0
+; GFX90A-NEXT: ; use v[0:31]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_accvgpr_read_b32 v31, a19 ; Reload Reuse
+; GFX90A-NEXT: buffer_load_dword a34, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword a33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword a32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v[0:31]
+; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v63, a18 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v62, a17 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v61, a16 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v60, a15 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v59, a14 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v58, a13 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v57, a12 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v56, a11 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v47, a10 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v46, a9 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v45, a8 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v44, a7 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v43, a6 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v42, a5 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v41, a4 ; Reload Reuse
-; GFX90A-NEXT: v_accvgpr_read_b32 v40, a3 ; Reload Reuse
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: global_atomic_xor_i32_ret_av_av_no_agprs:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a6, v43 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a7, v44 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a8, v45 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a9, v46 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a10, v47 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a11, v56 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a12, v57 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a13, v58 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a14, v59 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a15, v60 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse
+; GFX950-NEXT: scratch_store_dword off, v40, s32 offset:72 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v41, s32 offset:68 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v42, s32 offset:64 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v43, s32 offset:60 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v44, s32 offset:56 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v45, s32 offset:52 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v46, s32 offset:48 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v47, s32 offset:44 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v56, s32 offset:40 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v57, s32 offset:36 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v58, s32 offset:32 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v59, s32 offset:28 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v60, s32 offset:24 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v61, s32 offset:20 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v62, s32 offset:16 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, v63, s32 offset:12 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, a32, s32 offset:8 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, a33, s32 offset:4 ; 4-byte Folded Spill
+; GFX950-NEXT: scratch_store_dword off, a34, s32 ; 4-byte Folded Spill
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[0:31]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: scratch_store_dwordx4 off, v[0:3], s32 ; 16-byte Folded Spill
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: scratch_store_dwordx4 off, v[4:7], s32 offset:16 ; 16-byte Folded Spill
-; GFX950-NEXT: scratch_store_dwordx4 off, v[8:11], s32 offset:32 ; 16-byte Folded Spill
-; GFX950-NEXT: scratch_store_dwordx4 off, v[12:15], s32 offset:48 ; 16-byte Folded Spill
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def a2
+; GFX950-NEXT: ; def a34
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: v_accvgpr_write_b32 a19, v31 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_accvgpr_write_b32 a33, v31
+; GFX950-NEXT: v_accvgpr_write_b32 a32, v30
+; GFX950-NEXT: v_accvgpr_write_b32 a31, v29
+; GFX950-NEXT: v_accvgpr_write_b32 a30, v28
+; GFX950-NEXT: v_accvgpr_write_b32 a29, v27
+; GFX950-NEXT: v_accvgpr_write_b32 a28, v26
+; GFX950-NEXT: v_accvgpr_write_b32 a27, v25
+; GFX950-NEXT: v_accvgpr_write_b32 a26, v24
+; GFX950-NEXT: v_accvgpr_write_b32 a25, v23
+; GFX950-NEXT: v_accvgpr_write_b32 a24, v22
+; GFX950-NEXT: v_accvgpr_write_b32 a23, v21
+; GFX950-NEXT: v_accvgpr_write_b32 a22, v20
+; GFX950-NEXT: v_accvgpr_write_b32 a21, v19
+; GFX950-NEXT: v_accvgpr_write_b32 a20, v18
+; GFX950-NEXT: v_accvgpr_write_b32 a19, v17
+; GFX950-NEXT: v_accvgpr_write_b32 a18, v16
+; GFX950-NEXT: v_accvgpr_write_b32 a17, v15
+; GFX950-NEXT: v_accvgpr_write_b32 a16, v14
+; GFX950-NEXT: v_accvgpr_write_b32 a15, v13
+; GFX950-NEXT: v_accvgpr_write_b32 a14, v12
+; GFX950-NEXT: v_accvgpr_write_b32 a13, v11
+; GFX950-NEXT: v_accvgpr_write_b32 a12, v10
+; GFX950-NEXT: v_accvgpr_write_b32 a11, v9
+; GFX950-NEXT: v_accvgpr_write_b32 a10, v8
+; GFX950-NEXT: v_accvgpr_write_b32 a9, v7
+; GFX950-NEXT: v_accvgpr_write_b32 a8, v6
+; GFX950-NEXT: v_accvgpr_write_b32 a7, v5
+; GFX950-NEXT: v_accvgpr_write_b32 a6, v4
+; GFX950-NEXT: v_accvgpr_write_b32 a5, v3
+; GFX950-NEXT: v_accvgpr_write_b32 a4, v2
+; GFX950-NEXT: v_accvgpr_write_b32 a3, v1
+; GFX950-NEXT: v_accvgpr_write_b32 a2, v0
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
-; GFX950-NEXT: scratch_store_dwordx3 off, v[16:18], s32 offset:64 ; 12-byte Folded Spill
; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
-; GFX950-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX950-NEXT: v_accvgpr_read_b32 v2, a34
; GFX950-NEXT: buffer_wbl2 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: global_atomic_xor v0, v[0:1], v2, off sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: buffer_inv sc1
-; GFX950-NEXT: v_accvgpr_write_b32 a31, v19 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a27, v23 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a28, v22 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a29, v21 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a30, v20 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a23, v27 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a24, v26 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a25, v25 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_write_b32 a26, v24 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: scratch_load_dwordx4 v[0:3], off, s32 ; 16-byte Folded Reload
-; GFX950-NEXT: scratch_load_dwordx4 v[4:7], off, s32 offset:16 ; 16-byte Folded Reload
-; GFX950-NEXT: scratch_load_dwordx4 v[8:11], off, s32 offset:32 ; 16-byte Folded Reload
-; GFX950-NEXT: scratch_load_dwordx4 v[12:15], off, s32 offset:48 ; 16-byte Folded Reload
-; GFX950-NEXT: v_accvgpr_read_b32 v19, a31 ; Reload Reuse
-; GFX950-NEXT: scratch_load_dwordx3 v[16:18], off, s32 offset:64 ; 12-byte Folded Reload
-; GFX950-NEXT: v_accvgpr_read_b32 v23, a27 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v22, a28 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v21, a29 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v20, a30 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v27, a23 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v26, a24 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v25, a25 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v24, a26 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v0, a2
+; GFX950-NEXT: v_accvgpr_read_b32 v1, a3
+; GFX950-NEXT: v_accvgpr_read_b32 v2, a4
+; GFX950-NEXT: v_accvgpr_read_b32 v3, a5
+; GFX950-NEXT: v_accvgpr_read_b32 v4, a6
+; GFX950-NEXT: v_accvgpr_read_b32 v5, a7
+; GFX950-NEXT: v_accvgpr_read_b32 v6, a8
+; GFX950-NEXT: v_accvgpr_read_b32 v7, a9
+; GFX950-NEXT: v_accvgpr_read_b32 v8, a10
+; GFX950-NEXT: v_accvgpr_read_b32 v9, a11
+; GFX950-NEXT: v_accvgpr_read_b32 v10, a12
+; GFX950-NEXT: v_accvgpr_read_b32 v11, a13
+; GFX950-NEXT: v_accvgpr_read_b32 v12, a14
+; GFX950-NEXT: v_accvgpr_read_b32 v13, a15
+; GFX950-NEXT: v_accvgpr_read_b32 v14, a16
+; GFX950-NEXT: v_accvgpr_read_b32 v15, a17
+; GFX950-NEXT: v_accvgpr_read_b32 v16, a18
+; GFX950-NEXT: v_accvgpr_read_b32 v17, a19
+; GFX950-NEXT: v_accvgpr_read_b32 v18, a20
+; GFX950-NEXT: v_accvgpr_read_b32 v19, a21
+; GFX950-NEXT: v_accvgpr_read_b32 v20, a22
+; GFX950-NEXT: v_accvgpr_read_b32 v21, a23
+; GFX950-NEXT: v_accvgpr_read_b32 v22, a24
+; GFX950-NEXT: v_accvgpr_read_b32 v23, a25
+; GFX950-NEXT: v_accvgpr_read_b32 v24, a26
+; GFX950-NEXT: v_accvgpr_read_b32 v25, a27
+; GFX950-NEXT: v_accvgpr_read_b32 v26, a28
+; GFX950-NEXT: v_accvgpr_read_b32 v27, a29
+; GFX950-NEXT: v_accvgpr_read_b32 v28, a30
+; GFX950-NEXT: v_accvgpr_read_b32 v29, a31
+; GFX950-NEXT: v_accvgpr_read_b32 v30, a32
+; GFX950-NEXT: v_accvgpr_read_b32 v31, a33
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use a0
+; GFX950-NEXT: ; use v[0:31]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_accvgpr_read_b32 v31, a19 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v30, a20 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v29, a21 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v28, a22 ; Reload Reuse
+; GFX950-NEXT: scratch_load_dword a34, off, s32 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword a33, off, s32 offset:4 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword a32, off, s32 offset:8 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v63, off, s32 offset:12 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v62, off, s32 offset:16 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v61, off, s32 offset:20 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v60, off, s32 offset:24 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v59, off, s32 offset:28 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v58, off, s32 offset:32 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v57, off, s32 offset:36 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v56, off, s32 offset:40 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v47, off, s32 offset:44 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v46, off, s32 offset:48 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v45, off, s32 offset:52 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v44, off, s32 offset:56 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v43, off, s32 offset:60 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v42, off, s32 offset:64 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v41, off, s32 offset:68 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dword v40, off, s32 offset:72 ; 4-byte Folded Reload
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v[0:31]
+; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: v_accvgpr_read_b32 v63, a18 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v62, a17 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v61, a16 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v60, a15 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v59, a14 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v58, a13 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v57, a12 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v56, a11 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v47, a10 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v46, a9 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v45, a8 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v44, a7 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v43, a6 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v42, a5 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v41, a4 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v40, a3 ; Reload Reuse
+; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10
%data = call i32 asm "; def $0", "=^VA"()
@@ -3893,13 +3971,13 @@ define void @global_atomic_nand_i32_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB69_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -3922,12 +4000,12 @@ define void @global_atomic_nand_i32_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB69_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -4515,13 +4593,13 @@ define void @global_atomic_usub_cond_i32_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB85_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -4546,13 +4624,13 @@ define void @global_atomic_usub_cond_i32_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB85_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -4646,12 +4724,12 @@ define void @global_atomic_usub_sat_i32_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB87_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -4674,12 +4752,12 @@ define void @global_atomic_usub_sat_i32_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB87_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -5000,14 +5078,14 @@ define void @global_atomic_nand_i64_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB95_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -5033,14 +5111,14 @@ define void @global_atomic_nand_i64_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB95_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -5664,14 +5742,14 @@ define void @global_atomic_usub_cond_i64_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB111_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -5700,14 +5778,14 @@ define void @global_atomic_usub_cond_i64_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB111_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -5810,14 +5888,14 @@ define void @global_atomic_usub_sat_i64_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB113_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -5846,14 +5924,14 @@ define void @global_atomic_usub_sat_i64_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB113_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -6027,12 +6105,12 @@ define void @global_atomic_fsub_f32_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB117_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -6055,12 +6133,12 @@ define void @global_atomic_fsub_f32_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB117_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -6150,13 +6228,13 @@ define void @global_atomic_fmax_f32_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB119_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -6180,13 +6258,13 @@ define void @global_atomic_fmax_f32_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB119_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -6280,13 +6358,13 @@ define void @global_atomic_fmin_f32_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB121_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -6310,13 +6388,13 @@ define void @global_atomic_fmin_f32_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB121_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -6411,13 +6489,13 @@ define void @global_atomic_fmaximum_f32_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB123_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -6440,12 +6518,12 @@ define void @global_atomic_fmaximum_f32_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB123_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -6539,13 +6617,13 @@ define void @global_atomic_fminimum_f32_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB125_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -6568,12 +6646,12 @@ define void @global_atomic_fminimum_f32_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB125_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -6744,14 +6822,14 @@ define void @global_atomic_fsub_f64_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB129_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -6774,14 +6852,14 @@ define void @global_atomic_fsub_f64_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB129_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -7024,14 +7102,14 @@ define void @global_atomic_fmaximum_f64_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB135_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -7059,14 +7137,14 @@ define void @global_atomic_fmaximum_f64_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB135_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -7168,14 +7246,14 @@ define void @global_atomic_fminimum_f64_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB137_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -7203,14 +7281,14 @@ define void @global_atomic_fminimum_f64_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB137_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -7383,12 +7461,12 @@ define void @global_atomic_fsub_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB141_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -7411,12 +7489,12 @@ define void @global_atomic_fsub_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB141_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -7506,13 +7584,13 @@ define void @global_atomic_fmax_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB143_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -7537,13 +7615,13 @@ define void @global_atomic_fmax_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB143_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -7638,13 +7716,13 @@ define void @global_atomic_fmin_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB145_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -7669,13 +7747,13 @@ define void @global_atomic_fmin_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB145_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -7775,13 +7853,13 @@ define void @global_atomic_fmaximum_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB147_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -7804,12 +7882,12 @@ define void @global_atomic_fmaximum_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB147_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -7911,13 +7989,13 @@ define void @global_atomic_fminimum_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB149_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -7940,12 +8018,12 @@ define void @global_atomic_fminimum_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB149_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -8062,13 +8140,13 @@ define void @global_atomic_fadd_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB151_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -8195,13 +8273,13 @@ define void @global_atomic_fsub_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB153_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -8229,13 +8307,13 @@ define void @global_atomic_fsub_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB153_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -8365,13 +8443,13 @@ define void @global_atomic_fmax_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB155_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -8399,13 +8477,13 @@ define void @global_atomic_fmax_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB155_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -8535,13 +8613,13 @@ define void @global_atomic_fmin_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB157_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -8569,13 +8647,13 @@ define void @global_atomic_fmin_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB157_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -8710,13 +8788,13 @@ define void @global_atomic_fmaximum_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB159_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -8744,13 +8822,13 @@ define void @global_atomic_fmaximum_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB159_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -8890,13 +8968,13 @@ define void @global_atomic_fminimum_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB161_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -8924,13 +9002,13 @@ define void @global_atomic_fminimum_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB161_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -9353,13 +9431,13 @@ define void @global_atomic_nand_i32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #
; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB171_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -9383,12 +9461,12 @@ define void @global_atomic_nand_i32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB171_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -10082,13 +10160,13 @@ define void @global_atomic_usub_cond_i32_saddr_ret_a_a(ptr addrspace(1) inreg %p
; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB189_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -10114,13 +10192,13 @@ define void @global_atomic_usub_cond_i32_saddr_ret_a_a(ptr addrspace(1) inreg %p
; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB189_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -10217,12 +10295,12 @@ define void @global_atomic_usub_sat_i32_saddr_ret_a_a(ptr addrspace(1) inreg %pt
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB191_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -10246,12 +10324,12 @@ define void @global_atomic_usub_sat_i32_saddr_ret_a_a(ptr addrspace(1) inreg %pt
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB191_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -10662,14 +10740,14 @@ define void @global_atomic_nand_i64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #
; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[16:17] offset:80 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB201_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -10696,14 +10774,14 @@ define void @global_atomic_nand_i64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #
; GFX950-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:80 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB201_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -11429,14 +11507,14 @@ define void @global_atomic_usub_cond_i64_saddr_ret_a_a(ptr addrspace(1) inreg %p
; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[16:17] offset:80 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB219_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -11466,14 +11544,14 @@ define void @global_atomic_usub_cond_i64_saddr_ret_a_a(ptr addrspace(1) inreg %p
; GFX950-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:80 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB219_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -11579,14 +11657,14 @@ define void @global_atomic_usub_sat_i64_saddr_ret_a_a(ptr addrspace(1) inreg %pt
; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[16:17] offset:80 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB221_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -11616,14 +11694,14 @@ define void @global_atomic_usub_sat_i64_saddr_ret_a_a(ptr addrspace(1) inreg %pt
; GFX950-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:80 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB221_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -11804,12 +11882,12 @@ define void @global_atomic_fsub_f32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB225_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -11833,12 +11911,12 @@ define void @global_atomic_fsub_f32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB225_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -11931,13 +12009,13 @@ define void @global_atomic_fmax_f32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #
; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB227_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -11962,13 +12040,13 @@ define void @global_atomic_fmax_f32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #
; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB227_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -12065,13 +12143,13 @@ define void @global_atomic_fmin_f32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #
; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB229_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -12096,13 +12174,13 @@ define void @global_atomic_fmin_f32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #
; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB229_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -12200,13 +12278,13 @@ define void @global_atomic_fmaximum_f32_saddr_ret_a_a(ptr addrspace(1) inreg %pt
; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB231_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -12230,12 +12308,12 @@ define void @global_atomic_fmaximum_f32_saddr_ret_a_a(ptr addrspace(1) inreg %pt
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB231_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -12332,13 +12410,13 @@ define void @global_atomic_fminimum_f32_saddr_ret_a_a(ptr addrspace(1) inreg %pt
; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB233_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -12362,12 +12440,12 @@ define void @global_atomic_fminimum_f32_saddr_ret_a_a(ptr addrspace(1) inreg %pt
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB233_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -12544,14 +12622,14 @@ define void @global_atomic_fsub_f64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #
; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[16:17] offset:80 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB237_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -12575,14 +12653,14 @@ define void @global_atomic_fsub_f64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #
; GFX950-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:80 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB237_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -12834,14 +12912,14 @@ define void @global_atomic_fmaximum_f64_saddr_ret_a_a(ptr addrspace(1) inreg %pt
; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[16:17] offset:80 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB243_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -12870,14 +12948,14 @@ define void @global_atomic_fmaximum_f64_saddr_ret_a_a(ptr addrspace(1) inreg %pt
; GFX950-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:80 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB243_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -12982,14 +13060,14 @@ define void @global_atomic_fminimum_f64_saddr_ret_a_a(ptr addrspace(1) inreg %pt
; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[16:17] offset:80 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB245_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -13018,14 +13096,14 @@ define void @global_atomic_fminimum_f64_saddr_ret_a_a(ptr addrspace(1) inreg %pt
; GFX950-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:80 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB245_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -13205,12 +13283,12 @@ define void @global_atomic_fsub_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr)
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB249_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -13234,12 +13312,12 @@ define void @global_atomic_fsub_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr)
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB249_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -13332,13 +13410,13 @@ define void @global_atomic_fmax_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr)
; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB251_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -13364,13 +13442,13 @@ define void @global_atomic_fmax_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr)
; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB251_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -13468,13 +13546,13 @@ define void @global_atomic_fmin_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr)
; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB253_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -13500,13 +13578,13 @@ define void @global_atomic_fmin_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr)
; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB253_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -13609,13 +13687,13 @@ define void @global_atomic_fmaximum_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg %
; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB255_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -13639,12 +13717,12 @@ define void @global_atomic_fmaximum_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg %
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB255_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -13749,13 +13827,13 @@ define void @global_atomic_fminimum_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg %
; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB257_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -13779,12 +13857,12 @@ define void @global_atomic_fminimum_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg %
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB257_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -13904,13 +13982,13 @@ define void @global_atomic_fadd_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr
; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB259_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -14041,13 +14119,13 @@ define void @global_atomic_fsub_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr
; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB261_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -14076,13 +14154,13 @@ define void @global_atomic_fsub_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr
; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB261_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -14215,13 +14293,13 @@ define void @global_atomic_fmax_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr
; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB263_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -14250,13 +14328,13 @@ define void @global_atomic_fmax_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr
; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB263_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -14389,13 +14467,13 @@ define void @global_atomic_fmin_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr
; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB265_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -14424,13 +14502,13 @@ define void @global_atomic_fmin_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr
; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB265_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -14568,13 +14646,13 @@ define void @global_atomic_fmaximum_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg
; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB267_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -14603,13 +14681,13 @@ define void @global_atomic_fmaximum_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg
; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB267_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -14752,13 +14830,13 @@ define void @global_atomic_fminimum_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg
; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB269_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -14787,13 +14865,13 @@ define void @global_atomic_fminimum_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg
; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB269_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
index 9e240238c1066..ebbeab94066d6 100644
--- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
@@ -146,9 +146,9 @@ define void @no_free_vgprs_at_agpr_to_agpr_copy(float %v0, float %v1) #0 {
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; copy
; GFX908-NEXT: ;;#ASMEND
-; GFX908-NEXT: v_accvgpr_read_b32 v32, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v39, a2
; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: v_accvgpr_write_b32 a3, v32
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v39
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; use a3 v[0:31]
; GFX908-NEXT: ;;#ASMEND
@@ -437,9 +437,9 @@ define void @v32_asm_def_use(float %v0, float %v1) #4 {
; GFX908-NEXT: ; copy
; GFX908-NEXT: ;;#ASMEND
; GFX908-NEXT: s_nop 7
-; GFX908-NEXT: v_accvgpr_read_b32 v33, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v35, a2
; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: v_accvgpr_write_b32 a3, v33
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v35
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; use a3 v[0:31]
; GFX908-NEXT: ;;#ASMEND
@@ -1045,9 +1045,9 @@ define void @no_free_vgprs_at_sgpr_to_agpr_copy(float %v0, float %v1) #0 {
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; copy
; GFX908-NEXT: ;;#ASMEND
-; GFX908-NEXT: v_accvgpr_read_b32 v32, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v39, a2
; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: v_accvgpr_write_b32 a3, v32
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v39
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; use a3 v[0:31]
; GFX908-NEXT: ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-csr.ll b/llvm/test/CodeGen/AMDGPU/agpr-csr.ll
index 63b7b70548baf..0c5fd1fc0932a 100644
--- a/llvm/test/CodeGen/AMDGPU/agpr-csr.ll
+++ b/llvm/test/CodeGen/AMDGPU/agpr-csr.ll
@@ -180,55 +180,63 @@ define amdgpu_kernel void @test_call_empty() #0 {
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; def a[0:31]
; GFX908-NEXT: ;;#ASMEND
-; GFX908-NEXT: v_accvgpr_read_b32 v6, a3
-; GFX908-NEXT: v_accvgpr_read_b32 v5, a2
-; GFX908-NEXT: v_accvgpr_read_b32 v4, a1
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
-; GFX908-NEXT: v_accvgpr_read_b32 v10, a7
-; GFX908-NEXT: v_accvgpr_read_b32 v9, a6
-; GFX908-NEXT: v_accvgpr_read_b32 v8, a5
-; GFX908-NEXT: v_accvgpr_read_b32 v7, a4
-; GFX908-NEXT: v_accvgpr_read_b32 v14, a11
-; GFX908-NEXT: v_accvgpr_read_b32 v13, a10
-; GFX908-NEXT: v_accvgpr_read_b32 v12, a9
-; GFX908-NEXT: v_accvgpr_read_b32 v11, a8
-; GFX908-NEXT: v_accvgpr_read_b32 v18, a15
-; GFX908-NEXT: v_accvgpr_read_b32 v17, a14
-; GFX908-NEXT: v_accvgpr_read_b32 v16, a13
-; GFX908-NEXT: v_accvgpr_read_b32 v15, a12
-; GFX908-NEXT: v_accvgpr_read_b32 v22, a19
-; GFX908-NEXT: v_accvgpr_read_b32 v21, a18
-; GFX908-NEXT: v_accvgpr_read_b32 v20, a17
-; GFX908-NEXT: v_accvgpr_read_b32 v19, a16
-; GFX908-NEXT: v_accvgpr_read_b32 v26, a23
-; GFX908-NEXT: v_accvgpr_read_b32 v25, a22
-; GFX908-NEXT: v_accvgpr_read_b32 v24, a21
-; GFX908-NEXT: v_accvgpr_read_b32 v23, a20
-; GFX908-NEXT: v_accvgpr_read_b32 v30, a27
-; GFX908-NEXT: v_accvgpr_read_b32 v29, a26
-; GFX908-NEXT: v_accvgpr_read_b32 v28, a25
-; GFX908-NEXT: v_accvgpr_read_b32 v27, a24
-; GFX908-NEXT: v_accvgpr_read_b32 v35, a31
-; GFX908-NEXT: v_accvgpr_read_b32 v34, a30
-; GFX908-NEXT: v_accvgpr_read_b32 v33, a29
-; GFX908-NEXT: v_accvgpr_read_b32 v32, a28
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[32:35], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[27:30], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[23:26], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[19:22], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[15:18], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[11:14], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[7:10], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[3:6], off
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a28
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a31
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a20
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a23
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a16
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a19
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a12
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a15
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a8
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a11
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: s_endpgm
bb:
@@ -313,57 +321,65 @@ define amdgpu_kernel void @test_call_areg4() #0 {
; GFX908-NEXT: s_mov_b64 s[2:3], s[22:23]
; GFX908-NEXT: s_mov_b32 s32, 0
; GFX908-NEXT: ;;#ASMSTART
-; GFX908-NEXT: ; def a[0:31]
+; GFX908-NEXT: ; def a[4:35]
; GFX908-NEXT: ;;#ASMEND
-; GFX908-NEXT: v_accvgpr_read_b32 v6, a3
-; GFX908-NEXT: v_accvgpr_read_b32 v5, a2
-; GFX908-NEXT: v_accvgpr_read_b32 v4, a1
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
-; GFX908-NEXT: v_accvgpr_read_b32 v10, a7
-; GFX908-NEXT: v_accvgpr_read_b32 v9, a6
-; GFX908-NEXT: v_accvgpr_read_b32 v8, a5
-; GFX908-NEXT: v_accvgpr_read_b32 v7, a4
-; GFX908-NEXT: v_accvgpr_read_b32 v14, a11
-; GFX908-NEXT: v_accvgpr_read_b32 v13, a10
-; GFX908-NEXT: v_accvgpr_read_b32 v12, a9
-; GFX908-NEXT: v_accvgpr_read_b32 v11, a8
-; GFX908-NEXT: v_accvgpr_read_b32 v18, a15
-; GFX908-NEXT: v_accvgpr_read_b32 v17, a14
-; GFX908-NEXT: v_accvgpr_read_b32 v16, a13
-; GFX908-NEXT: v_accvgpr_read_b32 v15, a12
-; GFX908-NEXT: v_accvgpr_read_b32 v22, a19
-; GFX908-NEXT: v_accvgpr_read_b32 v21, a18
-; GFX908-NEXT: v_accvgpr_read_b32 v20, a17
-; GFX908-NEXT: v_accvgpr_read_b32 v19, a16
-; GFX908-NEXT: v_accvgpr_read_b32 v26, a23
-; GFX908-NEXT: v_accvgpr_read_b32 v25, a22
-; GFX908-NEXT: v_accvgpr_read_b32 v24, a21
-; GFX908-NEXT: v_accvgpr_read_b32 v23, a20
-; GFX908-NEXT: v_accvgpr_read_b32 v30, a27
-; GFX908-NEXT: v_accvgpr_read_b32 v29, a26
-; GFX908-NEXT: v_accvgpr_read_b32 v28, a25
-; GFX908-NEXT: v_accvgpr_read_b32 v27, a24
-; GFX908-NEXT: v_accvgpr_read_b32 v35, a31
-; GFX908-NEXT: v_accvgpr_read_b32 v34, a30
-; GFX908-NEXT: v_accvgpr_read_b32 v33, a29
-; GFX908-NEXT: v_accvgpr_read_b32 v32, a28
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[32:35], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[27:30], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[23:26], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[19:22], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[15:18], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[11:14], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[7:10], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[3:6], off
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a32
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a33
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a34
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a35
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a28
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a31
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a20
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a23
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a16
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a19
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a12
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a15
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a8
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a11
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: s_endpgm
bb:
@@ -448,57 +464,65 @@ define amdgpu_kernel void @test_call_areg32() #0 {
; GFX908-NEXT: s_mov_b64 s[2:3], s[22:23]
; GFX908-NEXT: s_mov_b32 s32, 0
; GFX908-NEXT: ;;#ASMSTART
-; GFX908-NEXT: ; def a[0:31]
+; GFX908-NEXT: ; def a[32:63]
; GFX908-NEXT: ;;#ASMEND
-; GFX908-NEXT: v_accvgpr_read_b32 v6, a3
-; GFX908-NEXT: v_accvgpr_read_b32 v5, a2
-; GFX908-NEXT: v_accvgpr_read_b32 v4, a1
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
-; GFX908-NEXT: v_accvgpr_read_b32 v10, a7
-; GFX908-NEXT: v_accvgpr_read_b32 v9, a6
-; GFX908-NEXT: v_accvgpr_read_b32 v8, a5
-; GFX908-NEXT: v_accvgpr_read_b32 v7, a4
-; GFX908-NEXT: v_accvgpr_read_b32 v14, a11
-; GFX908-NEXT: v_accvgpr_read_b32 v13, a10
-; GFX908-NEXT: v_accvgpr_read_b32 v12, a9
-; GFX908-NEXT: v_accvgpr_read_b32 v11, a8
-; GFX908-NEXT: v_accvgpr_read_b32 v18, a15
-; GFX908-NEXT: v_accvgpr_read_b32 v17, a14
-; GFX908-NEXT: v_accvgpr_read_b32 v16, a13
-; GFX908-NEXT: v_accvgpr_read_b32 v15, a12
-; GFX908-NEXT: v_accvgpr_read_b32 v22, a19
-; GFX908-NEXT: v_accvgpr_read_b32 v21, a18
-; GFX908-NEXT: v_accvgpr_read_b32 v20, a17
-; GFX908-NEXT: v_accvgpr_read_b32 v19, a16
-; GFX908-NEXT: v_accvgpr_read_b32 v26, a23
-; GFX908-NEXT: v_accvgpr_read_b32 v25, a22
-; GFX908-NEXT: v_accvgpr_read_b32 v24, a21
-; GFX908-NEXT: v_accvgpr_read_b32 v23, a20
-; GFX908-NEXT: v_accvgpr_read_b32 v30, a27
-; GFX908-NEXT: v_accvgpr_read_b32 v29, a26
-; GFX908-NEXT: v_accvgpr_read_b32 v28, a25
-; GFX908-NEXT: v_accvgpr_read_b32 v27, a24
-; GFX908-NEXT: v_accvgpr_read_b32 v35, a31
-; GFX908-NEXT: v_accvgpr_read_b32 v34, a30
-; GFX908-NEXT: v_accvgpr_read_b32 v33, a29
-; GFX908-NEXT: v_accvgpr_read_b32 v32, a28
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[32:35], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[27:30], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[23:26], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[19:22], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[15:18], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[11:14], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[7:10], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[3:6], off
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a60
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a61
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a62
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a63
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a56
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a57
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a58
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a59
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a52
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a53
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a54
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a55
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a48
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a49
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a50
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a51
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a44
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a45
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a46
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a47
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a40
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a41
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a42
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a43
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a36
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a37
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a38
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a39
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a32
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a33
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a34
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a35
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: s_endpgm
bb:
@@ -585,55 +609,63 @@ define amdgpu_kernel void @test_call_areg64() #0 {
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; def a[0:31]
; GFX908-NEXT: ;;#ASMEND
-; GFX908-NEXT: v_accvgpr_read_b32 v6, a3
-; GFX908-NEXT: v_accvgpr_read_b32 v5, a2
-; GFX908-NEXT: v_accvgpr_read_b32 v4, a1
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
-; GFX908-NEXT: v_accvgpr_read_b32 v10, a7
-; GFX908-NEXT: v_accvgpr_read_b32 v9, a6
-; GFX908-NEXT: v_accvgpr_read_b32 v8, a5
-; GFX908-NEXT: v_accvgpr_read_b32 v7, a4
-; GFX908-NEXT: v_accvgpr_read_b32 v14, a11
-; GFX908-NEXT: v_accvgpr_read_b32 v13, a10
-; GFX908-NEXT: v_accvgpr_read_b32 v12, a9
-; GFX908-NEXT: v_accvgpr_read_b32 v11, a8
-; GFX908-NEXT: v_accvgpr_read_b32 v18, a15
-; GFX908-NEXT: v_accvgpr_read_b32 v17, a14
-; GFX908-NEXT: v_accvgpr_read_b32 v16, a13
-; GFX908-NEXT: v_accvgpr_read_b32 v15, a12
-; GFX908-NEXT: v_accvgpr_read_b32 v22, a19
-; GFX908-NEXT: v_accvgpr_read_b32 v21, a18
-; GFX908-NEXT: v_accvgpr_read_b32 v20, a17
-; GFX908-NEXT: v_accvgpr_read_b32 v19, a16
-; GFX908-NEXT: v_accvgpr_read_b32 v26, a23
-; GFX908-NEXT: v_accvgpr_read_b32 v25, a22
-; GFX908-NEXT: v_accvgpr_read_b32 v24, a21
-; GFX908-NEXT: v_accvgpr_read_b32 v23, a20
-; GFX908-NEXT: v_accvgpr_read_b32 v30, a27
-; GFX908-NEXT: v_accvgpr_read_b32 v29, a26
-; GFX908-NEXT: v_accvgpr_read_b32 v28, a25
-; GFX908-NEXT: v_accvgpr_read_b32 v27, a24
-; GFX908-NEXT: v_accvgpr_read_b32 v35, a31
-; GFX908-NEXT: v_accvgpr_read_b32 v34, a30
-; GFX908-NEXT: v_accvgpr_read_b32 v33, a29
-; GFX908-NEXT: v_accvgpr_read_b32 v32, a28
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[32:35], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[27:30], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[23:26], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[19:22], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[15:18], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[11:14], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[7:10], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[3:6], off
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a28
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a31
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a20
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a23
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a16
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a19
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a12
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a15
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a8
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a11
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: s_endpgm
bb:
@@ -718,57 +750,65 @@ define amdgpu_kernel void @test_call_areg31_63() #0 {
; GFX908-NEXT: s_mov_b64 s[2:3], s[22:23]
; GFX908-NEXT: s_mov_b32 s32, 0
; GFX908-NEXT: ;;#ASMSTART
-; GFX908-NEXT: ; def a[0:31]
+; GFX908-NEXT: ; def a[64:95]
; GFX908-NEXT: ;;#ASMEND
-; GFX908-NEXT: v_accvgpr_read_b32 v6, a3
-; GFX908-NEXT: v_accvgpr_read_b32 v5, a2
-; GFX908-NEXT: v_accvgpr_read_b32 v4, a1
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
-; GFX908-NEXT: v_accvgpr_read_b32 v10, a7
-; GFX908-NEXT: v_accvgpr_read_b32 v9, a6
-; GFX908-NEXT: v_accvgpr_read_b32 v8, a5
-; GFX908-NEXT: v_accvgpr_read_b32 v7, a4
-; GFX908-NEXT: v_accvgpr_read_b32 v14, a11
-; GFX908-NEXT: v_accvgpr_read_b32 v13, a10
-; GFX908-NEXT: v_accvgpr_read_b32 v12, a9
-; GFX908-NEXT: v_accvgpr_read_b32 v11, a8
-; GFX908-NEXT: v_accvgpr_read_b32 v18, a15
-; GFX908-NEXT: v_accvgpr_read_b32 v17, a14
-; GFX908-NEXT: v_accvgpr_read_b32 v16, a13
-; GFX908-NEXT: v_accvgpr_read_b32 v15, a12
-; GFX908-NEXT: v_accvgpr_read_b32 v22, a19
-; GFX908-NEXT: v_accvgpr_read_b32 v21, a18
-; GFX908-NEXT: v_accvgpr_read_b32 v20, a17
-; GFX908-NEXT: v_accvgpr_read_b32 v19, a16
-; GFX908-NEXT: v_accvgpr_read_b32 v26, a23
-; GFX908-NEXT: v_accvgpr_read_b32 v25, a22
-; GFX908-NEXT: v_accvgpr_read_b32 v24, a21
-; GFX908-NEXT: v_accvgpr_read_b32 v23, a20
-; GFX908-NEXT: v_accvgpr_read_b32 v30, a27
-; GFX908-NEXT: v_accvgpr_read_b32 v29, a26
-; GFX908-NEXT: v_accvgpr_read_b32 v28, a25
-; GFX908-NEXT: v_accvgpr_read_b32 v27, a24
-; GFX908-NEXT: v_accvgpr_read_b32 v35, a31
-; GFX908-NEXT: v_accvgpr_read_b32 v34, a30
-; GFX908-NEXT: v_accvgpr_read_b32 v33, a29
-; GFX908-NEXT: v_accvgpr_read_b32 v32, a28
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[32:35], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[27:30], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[23:26], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[19:22], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[15:18], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[11:14], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[7:10], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[3:6], off
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a92
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a93
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a94
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a95
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a88
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a89
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a90
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a91
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a84
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a85
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a86
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a87
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a80
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a81
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a82
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a83
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a76
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a77
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a78
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a79
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a72
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a73
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a74
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a75
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a68
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a69
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a70
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a71
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a64
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a65
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a66
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a67
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: s_endpgm
bb:
@@ -849,61 +889,125 @@ define amdgpu_kernel void @test_call_unknown() #0 {
; GFX908-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX908-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX908-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX908-NEXT: v_or3_b32 v31, v0, v1, v2
-; GFX908-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX908-NEXT: s_mov_b32 s32, 0
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; def a[0:31]
; GFX908-NEXT: ;;#ASMEND
-; GFX908-NEXT: v_accvgpr_read_b32 v43, a3
-; GFX908-NEXT: v_accvgpr_read_b32 v42, a2
-; GFX908-NEXT: v_accvgpr_read_b32 v41, a1
-; GFX908-NEXT: v_accvgpr_read_b32 v40, a0
-; GFX908-NEXT: v_accvgpr_read_b32 v47, a7
-; GFX908-NEXT: v_accvgpr_read_b32 v46, a6
-; GFX908-NEXT: v_accvgpr_read_b32 v45, a5
-; GFX908-NEXT: v_accvgpr_read_b32 v44, a4
-; GFX908-NEXT: v_accvgpr_read_b32 v59, a11
-; GFX908-NEXT: v_accvgpr_read_b32 v58, a10
-; GFX908-NEXT: v_accvgpr_read_b32 v57, a9
-; GFX908-NEXT: v_accvgpr_read_b32 v56, a8
-; GFX908-NEXT: v_accvgpr_read_b32 v63, a15
-; GFX908-NEXT: v_accvgpr_read_b32 v62, a14
-; GFX908-NEXT: v_accvgpr_read_b32 v61, a13
-; GFX908-NEXT: v_accvgpr_read_b32 v60, a12
-; GFX908-NEXT: v_accvgpr_read_b32 v75, a19
-; GFX908-NEXT: v_accvgpr_read_b32 v74, a18
-; GFX908-NEXT: v_accvgpr_read_b32 v73, a17
-; GFX908-NEXT: v_accvgpr_read_b32 v72, a16
-; GFX908-NEXT: v_accvgpr_read_b32 v79, a23
-; GFX908-NEXT: v_accvgpr_read_b32 v78, a22
-; GFX908-NEXT: v_accvgpr_read_b32 v77, a21
-; GFX908-NEXT: v_accvgpr_read_b32 v76, a20
-; GFX908-NEXT: v_accvgpr_read_b32 v91, a27
-; GFX908-NEXT: v_accvgpr_read_b32 v90, a26
-; GFX908-NEXT: v_accvgpr_read_b32 v89, a25
-; GFX908-NEXT: v_accvgpr_read_b32 v88, a24
-; GFX908-NEXT: v_accvgpr_read_b32 v95, a31
-; GFX908-NEXT: v_accvgpr_read_b32 v94, a30
-; GFX908-NEXT: v_accvgpr_read_b32 v93, a29
-; GFX908-NEXT: v_accvgpr_read_b32 v92, a28
+; GFX908-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX908-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX908-NEXT: s_mov_b32 s32, 0
+; GFX908-NEXT: v_accvgpr_read_b32 v95, a0 ; Reload Reuse
+; GFX908-NEXT: v_accvgpr_read_b32 v94, a1 ; Reload Reuse
+; GFX908-NEXT: v_accvgpr_read_b32 v93, a2 ; Reload Reuse
+; GFX908-NEXT: v_accvgpr_read_b32 v92, a3 ; Reload Reuse
+; GFX908-NEXT: v_accvgpr_read_b32 v91, a4 ; Reload Reuse
+; GFX908-NEXT: v_accvgpr_read_b32 v90, a5 ; Reload Reuse
+; GFX908-NEXT: v_accvgpr_read_b32 v89, a6 ; Reload Reuse
+; GFX908-NEXT: v_accvgpr_read_b32 v88, a7 ; Reload Reuse
+; GFX908-NEXT: v_accvgpr_read_b32 v79, a8 ; Reload Reuse
+; GFX908-NEXT: v_accvgpr_read_b32 v78, a9 ; Reload Reuse
+; GFX908-NEXT: v_accvgpr_read_b32 v77, a10 ; Reload Reuse
+; GFX908-NEXT: v_accvgpr_read_b32 v76, a11 ; Reload Reuse
+; GFX908-NEXT: v_accvgpr_read_b32 v75, a12 ; Reload Reuse
+; GFX908-NEXT: v_accvgpr_read_b32 v74, a13 ; Reload Reuse
+; GFX908-NEXT: v_accvgpr_read_b32 v73, a14 ; Reload Reuse
+; GFX908-NEXT: v_accvgpr_read_b32 v72, a15 ; Reload Reuse
+; GFX908-NEXT: v_accvgpr_read_b32 v63, a16 ; Reload Reuse
+; GFX908-NEXT: v_accvgpr_read_b32 v62, a17 ; Reload Reuse
+; GFX908-NEXT: v_accvgpr_read_b32 v61, a18 ; Reload Reuse
+; GFX908-NEXT: v_accvgpr_read_b32 v60, a19 ; Reload Reuse
+; GFX908-NEXT: v_accvgpr_read_b32 v59, a20 ; Reload Reuse
+; GFX908-NEXT: v_accvgpr_read_b32 v58, a21 ; Reload Reuse
+; GFX908-NEXT: v_accvgpr_read_b32 v57, a22 ; Reload Reuse
+; GFX908-NEXT: v_accvgpr_read_b32 v56, a23 ; Reload Reuse
+; GFX908-NEXT: v_accvgpr_read_b32 v47, a24 ; Reload Reuse
+; GFX908-NEXT: v_accvgpr_read_b32 v46, a25 ; Reload Reuse
+; GFX908-NEXT: v_accvgpr_read_b32 v45, a26 ; Reload Reuse
+; GFX908-NEXT: v_accvgpr_read_b32 v44, a27 ; Reload Reuse
+; GFX908-NEXT: v_accvgpr_read_b32 v43, a28 ; Reload Reuse
+; GFX908-NEXT: v_accvgpr_read_b32 v42, a29 ; Reload Reuse
+; GFX908-NEXT: v_accvgpr_read_b32 v41, a30 ; Reload Reuse
+; GFX908-NEXT: v_accvgpr_read_b32 v40, a31 ; Reload Reuse
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[92:95], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[88:91], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[76:79], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[72:75], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[60:63], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[56:59], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[44:47], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[40:43], off
+; GFX908-NEXT: v_mov_b32_e32 v4, v95
+; GFX908-NEXT: v_mov_b32_e32 v5, v94
+; GFX908-NEXT: v_mov_b32_e32 v6, v93
+; GFX908-NEXT: v_mov_b32_e32 v7, v92
+; GFX908-NEXT: v_mov_b32_e32 v8, v91
+; GFX908-NEXT: v_mov_b32_e32 v9, v90
+; GFX908-NEXT: v_mov_b32_e32 v10, v89
+; GFX908-NEXT: v_mov_b32_e32 v11, v88
+; GFX908-NEXT: v_mov_b32_e32 v12, v79
+; GFX908-NEXT: v_mov_b32_e32 v13, v78
+; GFX908-NEXT: v_mov_b32_e32 v14, v77
+; GFX908-NEXT: v_mov_b32_e32 v15, v76
+; GFX908-NEXT: v_mov_b32_e32 v16, v75
+; GFX908-NEXT: v_mov_b32_e32 v17, v74
+; GFX908-NEXT: v_mov_b32_e32 v18, v73
+; GFX908-NEXT: v_mov_b32_e32 v19, v72
+; GFX908-NEXT: v_mov_b32_e32 v20, v63
+; GFX908-NEXT: v_mov_b32_e32 v21, v62
+; GFX908-NEXT: v_mov_b32_e32 v22, v61
+; GFX908-NEXT: v_mov_b32_e32 v23, v60
+; GFX908-NEXT: v_mov_b32_e32 v24, v59
+; GFX908-NEXT: v_mov_b32_e32 v25, v58
+; GFX908-NEXT: v_mov_b32_e32 v26, v57
+; GFX908-NEXT: v_mov_b32_e32 v27, v56
+; GFX908-NEXT: v_mov_b32_e32 v28, v47
+; GFX908-NEXT: v_mov_b32_e32 v29, v46
+; GFX908-NEXT: v_mov_b32_e32 v30, v45
+; GFX908-NEXT: v_mov_b32_e32 v31, v44
+; GFX908-NEXT: v_mov_b32_e32 v32, v43
+; GFX908-NEXT: v_mov_b32_e32 v33, v42
+; GFX908-NEXT: v_mov_b32_e32 v34, v41
+; GFX908-NEXT: v_mov_b32_e32 v35, v40
+; GFX908-NEXT: v_mov_b32_e32 v0, v32
+; GFX908-NEXT: v_mov_b32_e32 v1, v33
+; GFX908-NEXT: v_mov_b32_e32 v2, v34
+; GFX908-NEXT: v_mov_b32_e32 v3, v35
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v0, v28
+; GFX908-NEXT: v_mov_b32_e32 v1, v29
+; GFX908-NEXT: v_mov_b32_e32 v2, v30
+; GFX908-NEXT: v_mov_b32_e32 v3, v31
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v0, v24
+; GFX908-NEXT: v_mov_b32_e32 v1, v25
+; GFX908-NEXT: v_mov_b32_e32 v2, v26
+; GFX908-NEXT: v_mov_b32_e32 v3, v27
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v0, v20
+; GFX908-NEXT: v_mov_b32_e32 v1, v21
+; GFX908-NEXT: v_mov_b32_e32 v2, v22
+; GFX908-NEXT: v_mov_b32_e32 v3, v23
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v0, v16
+; GFX908-NEXT: v_mov_b32_e32 v1, v17
+; GFX908-NEXT: v_mov_b32_e32 v2, v18
+; GFX908-NEXT: v_mov_b32_e32 v3, v19
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v0, v12
+; GFX908-NEXT: v_mov_b32_e32 v1, v13
+; GFX908-NEXT: v_mov_b32_e32 v2, v14
+; GFX908-NEXT: v_mov_b32_e32 v3, v15
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v0, v8
+; GFX908-NEXT: v_mov_b32_e32 v1, v9
+; GFX908-NEXT: v_mov_b32_e32 v2, v10
+; GFX908-NEXT: v_mov_b32_e32 v3, v11
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v0, v4
+; GFX908-NEXT: v_mov_b32_e32 v1, v5
+; GFX908-NEXT: v_mov_b32_e32 v2, v6
+; GFX908-NEXT: v_mov_b32_e32 v3, v7
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: s_endpgm
bb:
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-remat.ll b/llvm/test/CodeGen/AMDGPU/agpr-remat.ll
index 1a2dd6e5f90f6..1180fc7b35a0b 100644
--- a/llvm/test/CodeGen/AMDGPU/agpr-remat.ll
+++ b/llvm/test/CodeGen/AMDGPU/agpr-remat.ll
@@ -29,17 +29,17 @@ define void @remat_regcopy_avoids_spill(i32 %v0, i32 %v1, i32 %v2, i32 %v3, i32
; GFX908-LABEL: remat_regcopy_avoids_spill:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_accvgpr_write_b32 a2, v0
-; GFX908-NEXT: v_accvgpr_write_b32 a3, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v5
+; GFX908-NEXT: v_accvgpr_write_b32 a2, v4
; GFX908-NEXT: v_accvgpr_write_b32 a4, v2
-; GFX908-NEXT: v_accvgpr_write_b32 a0, v7
-; GFX908-NEXT: v_accvgpr_write_b32 a1, v8
-; GFX908-NEXT: v_accvgpr_write_b32 a5, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a5, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a6, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a7, v7
+; GFX908-NEXT: v_accvgpr_write_b32 a0, v8
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ;;#ASMEND
-; GFX908-NEXT: v_accvgpr_write_b32 a2, v4
-; GFX908-NEXT: v_accvgpr_write_b32 a3, v5
-; GFX908-NEXT: v_accvgpr_write_b32 a4, v6
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v6
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ;;#ASMEND
; GFX908-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
index c3b14e8829042..2cbf39e2464bc 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
@@ -423,7 +423,8 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp
; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: buffer_atomic_add_f32 v5, v4, s[4:7], 0 offen offset:1024 sc0
+; GFX942-NEXT: v_mov_b32_e32 v6, v5
+; GFX942-NEXT: buffer_atomic_add_f32 v6, v4, s[4:7], 0 offen offset:1024 sc0
; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX942-NEXT: ; implicit-def: $vgpr4
; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
@@ -431,7 +432,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp
; GFX942-NEXT: ; %bb.2:
; GFX942-NEXT: s_mov_b64 exec, s[2:3]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v0, v5
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
@@ -541,7 +542,8 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: buffer_atomic_add_f32 v5, v4, s[8:11], 0 offen offset:1024 glc
+; GFX90A-NEXT: v_mov_b32_e32 v6, v5
+; GFX90A-NEXT: buffer_atomic_add_f32 v6, v4, s[8:11], 0 offen offset:1024 glc
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX90A-NEXT: ; implicit-def: $vgpr4
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
@@ -549,7 +551,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp
; GFX90A-NEXT: ; %bb.2:
; GFX90A-NEXT: s_mov_b64 exec, s[6:7]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, v5
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2441,8 +2443,8 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg
; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v6
-; GFX942-NEXT: v_mov_b32_e32 v6, v5
+; GFX942-NEXT: v_mov_b32_e32 v9, v6
+; GFX942-NEXT: v_mov_b32_e32 v8, v5
; GFX942-NEXT: s_mov_b64 s[2:3], exec
; GFX942-NEXT: buffer_wbl2 sc1
; GFX942-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1
@@ -2456,6 +2458,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg
; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[8:9]
; GFX942-NEXT: buffer_atomic_add_f64 v[6:7], v4, s[4:7], 0 offen offset:2048 sc0
; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX942-NEXT: ; implicit-def: $vgpr4
@@ -2607,8 +2610,8 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v6
-; GFX90A-NEXT: v_mov_b32_e32 v6, v5
+; GFX90A-NEXT: v_mov_b32_e32 v9, v6
+; GFX90A-NEXT: v_mov_b32_e32 v8, v5
; GFX90A-NEXT: s_mov_b64 s[6:7], exec
; GFX90A-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -2620,6 +2623,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1]
; GFX90A-NEXT: buffer_atomic_add_f64 v[6:7], v4, s[8:11], 0 offen offset:2048 glc
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX90A-NEXT: ; implicit-def: $vgpr4
@@ -4485,7 +4489,6 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX942-NEXT: v_lshlrev_b32_e32 v6, v4, v6
; GFX942-NEXT: v_and_or_b32 v6, v7, v11, v6
; GFX942-NEXT: s_mov_b64 s[8:9], exec
-; GFX942-NEXT: v_mov_b64_e32 v[8:9], v[6:7]
; GFX942-NEXT: buffer_wbl2 sc1
; GFX942-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
; GFX942-NEXT: ; => This Inner Loop Header: Depth=2
@@ -4499,6 +4502,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: v_mov_b64_e32 v[8:9], v[6:7]
; GFX942-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[4:7], 0 offen sc0
; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB15_4
@@ -4774,7 +4778,6 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX90A-NEXT: v_lshlrev_b32_e32 v6, v4, v6
; GFX90A-NEXT: v_and_or_b32 v6, v7, v11, v6
; GFX90A-NEXT: s_mov_b64 s[12:13], exec
-; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -4786,6 +4789,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB15_4
@@ -6348,7 +6352,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX942-NEXT: v_and_or_b32 v6, v7, v10, v4
-; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
; GFX942-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
; GFX942-NEXT: ; => This Inner Loop Header: Depth=2
; GFX942-NEXT: v_readfirstlane_b32 s4, v0
@@ -6361,6 +6364,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0
; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB18_4
@@ -6674,7 +6678,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX90A-NEXT: v_and_or_b32 v6, v7, v10, v4
; GFX90A-NEXT: s_mov_b64 s[12:13], exec
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -6686,6 +6689,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB18_4
@@ -7528,7 +7532,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: buffer_atomic_pk_add_f16 v5, v4, s[4:7], 0 offen offset:1024 sc0
+; GFX942-NEXT: v_mov_b32_e32 v6, v5
+; GFX942-NEXT: buffer_atomic_pk_add_f16 v6, v4, s[4:7], 0 offen offset:1024 sc0
; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX942-NEXT: ; implicit-def: $vgpr4
; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
@@ -7536,7 +7541,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
; GFX942-NEXT: ; %bb.2:
; GFX942-NEXT: s_mov_b64 exec, s[2:3]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v0, v5
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
@@ -7682,7 +7687,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: buffer_atomic_pk_add_f16 v5, v4, s[8:11], 0 offen offset:1024 glc
+; GFX90A-NEXT: v_mov_b32_e32 v6, v5
+; GFX90A-NEXT: buffer_atomic_pk_add_f16 v6, v4, s[8:11], 0 offen offset:1024 glc
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX90A-NEXT: ; implicit-def: $vgpr4
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
@@ -7690,7 +7696,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
; GFX90A-NEXT: ; %bb.2:
; GFX90A-NEXT: s_mov_b64 exec, s[6:7]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, v5
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -9975,7 +9981,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc
; GFX942-NEXT: v_perm_b32 v6, v5, v4, s11
-; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
; GFX942-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1
; GFX942-NEXT: ; => This Inner Loop Header: Depth=2
; GFX942-NEXT: v_readfirstlane_b32 s4, v0
@@ -9988,6 +9993,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0
; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB28_4
@@ -10301,7 +10307,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc
; GFX90A-NEXT: v_perm_b32 v6, v5, v4, s15
; GFX90A-NEXT: s_mov_b64 s[12:13], exec
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -10313,6 +10318,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB28_4
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
index f7a1fb35c8106..187c8c9c11fa3 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
@@ -429,7 +429,6 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp
; GFX942-NEXT: v_max_f32_e32 v4, v7, v7
; GFX942-NEXT: v_max_f32_e32 v6, v4, v9
; GFX942-NEXT: s_mov_b64 s[8:9], exec
-; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
; GFX942-NEXT: buffer_wbl2 sc1
; GFX942-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1
; GFX942-NEXT: ; => This Inner Loop Header: Depth=2
@@ -443,6 +442,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp
; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0
; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB2_4
@@ -549,7 +549,6 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp
; GFX90A-NEXT: v_max_f32_e32 v4, v7, v7
; GFX90A-NEXT: v_max_f32_e32 v6, v4, v9
; GFX90A-NEXT: s_mov_b64 s[12:13], exec
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -561,6 +560,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB2_4
@@ -1653,8 +1653,8 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg
; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v6
-; GFX942-NEXT: v_mov_b32_e32 v6, v5
+; GFX942-NEXT: v_mov_b32_e32 v9, v6
+; GFX942-NEXT: v_mov_b32_e32 v8, v5
; GFX942-NEXT: s_mov_b64 s[2:3], exec
; GFX942-NEXT: buffer_wbl2 sc1
; GFX942-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
@@ -1668,6 +1668,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg
; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[8:9]
; GFX942-NEXT: buffer_atomic_max_f64 v[6:7], v4, s[4:7], 0 offen offset:2048 sc0
; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX942-NEXT: ; implicit-def: $vgpr4
@@ -1783,8 +1784,8 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v6
-; GFX90A-NEXT: v_mov_b32_e32 v6, v5
+; GFX90A-NEXT: v_mov_b32_e32 v9, v6
+; GFX90A-NEXT: v_mov_b32_e32 v8, v5
; GFX90A-NEXT: s_mov_b64 s[6:7], exec
; GFX90A-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -1796,6 +1797,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1]
; GFX90A-NEXT: buffer_atomic_max_f64 v[6:7], v4, s[8:11], 0 offen offset:2048 glc
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX90A-NEXT: ; implicit-def: $vgpr4
@@ -3603,7 +3605,6 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX942-NEXT: v_lshlrev_b32_e32 v4, v8, v4
; GFX942-NEXT: v_and_or_b32 v6, v7, v10, v4
; GFX942-NEXT: s_mov_b64 s[8:9], exec
-; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
; GFX942-NEXT: buffer_wbl2 sc1
; GFX942-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1
; GFX942-NEXT: ; => This Inner Loop Header: Depth=2
@@ -3617,6 +3618,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0
; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB12_4
@@ -3902,7 +3904,6 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v8, v4
; GFX90A-NEXT: v_and_or_b32 v6, v7, v10, v4
; GFX90A-NEXT: s_mov_b64 s[12:13], exec
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -3914,6 +3915,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB12_4
@@ -5484,7 +5486,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX942-NEXT: v_and_or_b32 v6, v7, v10, v4
-; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
; GFX942-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
; GFX942-NEXT: ; => This Inner Loop Header: Depth=2
; GFX942-NEXT: v_readfirstlane_b32 s4, v0
@@ -5497,6 +5498,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0
; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB15_4
@@ -5810,7 +5812,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX90A-NEXT: v_and_or_b32 v6, v7, v10, v4
; GFX90A-NEXT: s_mov_b64 s[12:13], exec
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -5822,6 +5823,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB15_4
@@ -6876,7 +6878,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX942-NEXT: s_mov_b64 s[8:9], exec
; GFX942-NEXT: v_pk_max_f16 v6, v4, v9
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
; GFX942-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
; GFX942-NEXT: ; => This Inner Loop Header: Depth=2
; GFX942-NEXT: v_readfirstlane_b32 s4, v0
@@ -6889,6 +6890,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0
; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB18_4
@@ -7068,7 +7070,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX90A-NEXT: v_pk_max_f16 v4, v7, v7
; GFX90A-NEXT: v_pk_max_f16 v6, v4, v9
; GFX90A-NEXT: s_mov_b64 s[12:13], exec
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -7080,6 +7081,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB18_4
@@ -8665,7 +8667,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc
; GFX942-NEXT: v_perm_b32 v6, v5, v4, s11
-; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
; GFX942-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
; GFX942-NEXT: ; => This Inner Loop Header: Depth=2
; GFX942-NEXT: v_readfirstlane_b32 s4, v0
@@ -8678,6 +8679,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0
; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB21_4
@@ -8991,7 +8993,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc
; GFX90A-NEXT: v_perm_b32 v6, v5, v4, s15
; GFX90A-NEXT: s_mov_b64 s[12:13], exec
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -9003,6 +9004,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB21_4
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
index 8ac6353133e72..acbea3921b616 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
@@ -429,7 +429,6 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp
; GFX942-NEXT: v_max_f32_e32 v4, v7, v7
; GFX942-NEXT: v_min_f32_e32 v6, v4, v9
; GFX942-NEXT: s_mov_b64 s[8:9], exec
-; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
; GFX942-NEXT: buffer_wbl2 sc1
; GFX942-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1
; GFX942-NEXT: ; => This Inner Loop Header: Depth=2
@@ -443,6 +442,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp
; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0
; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB2_4
@@ -549,7 +549,6 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp
; GFX90A-NEXT: v_max_f32_e32 v4, v7, v7
; GFX90A-NEXT: v_min_f32_e32 v6, v4, v9
; GFX90A-NEXT: s_mov_b64 s[12:13], exec
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -561,6 +560,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB2_4
@@ -1653,8 +1653,8 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg
; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v6
-; GFX942-NEXT: v_mov_b32_e32 v6, v5
+; GFX942-NEXT: v_mov_b32_e32 v9, v6
+; GFX942-NEXT: v_mov_b32_e32 v8, v5
; GFX942-NEXT: s_mov_b64 s[2:3], exec
; GFX942-NEXT: buffer_wbl2 sc1
; GFX942-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
@@ -1668,6 +1668,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg
; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[8:9]
; GFX942-NEXT: buffer_atomic_min_f64 v[6:7], v4, s[4:7], 0 offen offset:2048 sc0
; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX942-NEXT: ; implicit-def: $vgpr4
@@ -1783,8 +1784,8 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v6
-; GFX90A-NEXT: v_mov_b32_e32 v6, v5
+; GFX90A-NEXT: v_mov_b32_e32 v9, v6
+; GFX90A-NEXT: v_mov_b32_e32 v8, v5
; GFX90A-NEXT: s_mov_b64 s[6:7], exec
; GFX90A-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -1796,6 +1797,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1]
; GFX90A-NEXT: buffer_atomic_min_f64 v[6:7], v4, s[8:11], 0 offen offset:2048 glc
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX90A-NEXT: ; implicit-def: $vgpr4
@@ -3603,7 +3605,6 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX942-NEXT: v_lshlrev_b32_e32 v4, v8, v4
; GFX942-NEXT: v_and_or_b32 v6, v7, v10, v4
; GFX942-NEXT: s_mov_b64 s[8:9], exec
-; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
; GFX942-NEXT: buffer_wbl2 sc1
; GFX942-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1
; GFX942-NEXT: ; => This Inner Loop Header: Depth=2
@@ -3617,6 +3618,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0
; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB12_4
@@ -3902,7 +3904,6 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v8, v4
; GFX90A-NEXT: v_and_or_b32 v6, v7, v10, v4
; GFX90A-NEXT: s_mov_b64 s[12:13], exec
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -3914,6 +3915,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB12_4
@@ -5484,7 +5486,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX942-NEXT: v_and_or_b32 v6, v7, v10, v4
-; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
; GFX942-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
; GFX942-NEXT: ; => This Inner Loop Header: Depth=2
; GFX942-NEXT: v_readfirstlane_b32 s4, v0
@@ -5497,6 +5498,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0
; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB15_4
@@ -5810,7 +5812,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX90A-NEXT: v_and_or_b32 v6, v7, v10, v4
; GFX90A-NEXT: s_mov_b64 s[12:13], exec
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -5822,6 +5823,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB15_4
@@ -6876,7 +6878,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX942-NEXT: s_mov_b64 s[8:9], exec
; GFX942-NEXT: v_pk_min_f16 v6, v4, v9
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
; GFX942-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
; GFX942-NEXT: ; => This Inner Loop Header: Depth=2
; GFX942-NEXT: v_readfirstlane_b32 s4, v0
@@ -6889,6 +6890,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0
; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB18_4
@@ -7068,7 +7070,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX90A-NEXT: v_pk_max_f16 v4, v7, v7
; GFX90A-NEXT: v_pk_min_f16 v6, v4, v9
; GFX90A-NEXT: s_mov_b64 s[12:13], exec
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -7080,6 +7081,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB18_4
@@ -8665,7 +8667,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc
; GFX942-NEXT: v_perm_b32 v6, v5, v4, s11
-; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
; GFX942-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
; GFX942-NEXT: ; => This Inner Loop Header: Depth=2
; GFX942-NEXT: v_readfirstlane_b32 s4, v0
@@ -8678,6 +8679,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0
; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB21_4
@@ -8991,7 +8993,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc
; GFX90A-NEXT: v_perm_b32 v6, v5, v4, s15
; GFX90A-NEXT: s_mov_b64 s[12:13], exec
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -9003,6 +9004,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB21_4
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll
index 3c991cfb7a1aa..0199e2866b35d 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll
@@ -258,68 +258,59 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7)
; SDAG-GFX942-NEXT: .LBB0_1: ; %load-store-loop
; SDAG-GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; SDAG-GFX942-NEXT: s_add_i32 s1, s0, s16
-; SDAG-GFX942-NEXT: v_mov_b32_e32 v60, s1
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[8:11], v60, s[4:7], 0 offen
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[4:7], v60, s[4:7], 0 offen offset:16
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[12:15], v60, s[4:7], 0 offen offset:32
-; SDAG-GFX942-NEXT: s_add_i32 s2, s8, s16
-; SDAG-GFX942-NEXT: v_mov_b32_e32 v0, s2
+; SDAG-GFX942-NEXT: v_mov_b32_e32 v0, s1
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v0, s[4:7], 0 offen
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[6:9], v0, s[4:7], 0 offen offset:16
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v0, s[4:7], 0 offen offset:32
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[14:17], v0, s[4:7], 0 offen offset:48
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[18:21], v0, s[4:7], 0 offen offset:64
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[22:25], v0, s[4:7], 0 offen offset:80
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[26:29], v0, s[4:7], 0 offen offset:96
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[30:33], v0, s[4:7], 0 offen offset:112
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[34:37], v0, s[4:7], 0 offen offset:128
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[38:41], v0, s[4:7], 0 offen offset:144
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[42:45], v0, s[4:7], 0 offen offset:160
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[46:49], v0, s[4:7], 0 offen offset:176
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[50:53], v0, s[4:7], 0 offen offset:192
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[54:57], v0, s[4:7], 0 offen offset:208
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[58:61], v0, s[4:7], 0 offen offset:224
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 a[0:3], v0, s[4:7], 0 offen offset:240
+; SDAG-GFX942-NEXT: s_add_i32 s1, s8, s16
; SDAG-GFX942-NEXT: s_addk_i32 s16, 0x100
+; SDAG-GFX942-NEXT: v_mov_b32_e32 v0, s1
; SDAG-GFX942-NEXT: s_cmpk_lt_u32 s16, 0x2000
-; SDAG-GFX942-NEXT: s_waitcnt vmcnt(0)
-; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a0, v15 ; Reload Reuse
-; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a1, v14 ; Reload Reuse
-; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a2, v13 ; Reload Reuse
-; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a3, v12 ; Reload Reuse
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[12:15], v60, s[4:7], 0 offen offset:48
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[16:19], v60, s[4:7], 0 offen offset:64
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[20:23], v60, s[4:7], 0 offen offset:80
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[24:27], v60, s[4:7], 0 offen offset:96
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[28:31], v60, s[4:7], 0 offen offset:112
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[32:35], v60, s[4:7], 0 offen offset:128
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[36:39], v60, s[4:7], 0 offen offset:144
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[40:43], v60, s[4:7], 0 offen offset:160
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[44:47], v60, s[4:7], 0 offen offset:176
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[48:51], v60, s[4:7], 0 offen offset:192
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[52:55], v60, s[4:7], 0 offen offset:208
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[56:59], v60, s[4:7], 0 offen offset:224
-; SDAG-GFX942-NEXT: s_nop 0
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[60:63], v60, s[4:7], 0 offen offset:240
-; SDAG-GFX942-NEXT: s_nop 0
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[8:11], v0, s[12:15], 0 offen
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[4:7], v0, s[12:15], 0 offen offset:16
-; SDAG-GFX942-NEXT: s_nop 1
-; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v5, a0 ; Reload Reuse
-; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v4, a1 ; Reload Reuse
-; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v3, a2 ; Reload Reuse
-; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v2, a3 ; Reload Reuse
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v0, s[12:15], 0 offen offset:32
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[12:15], v0, s[12:15], 0 offen offset:48
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v0, s[12:15], 0 offen
+; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v0, s[12:15], 0 offen offset:16
+; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v0, s[12:15], 0 offen offset:32
+; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v0, s[12:15], 0 offen offset:48
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[16:19], v0, s[12:15], 0 offen offset:64
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v0, s[12:15], 0 offen offset:64
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[20:23], v0, s[12:15], 0 offen offset:80
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v0, s[12:15], 0 offen offset:80
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[24:27], v0, s[12:15], 0 offen offset:96
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v0, s[12:15], 0 offen offset:96
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[28:31], v0, s[12:15], 0 offen offset:112
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[30:33], v0, s[12:15], 0 offen offset:112
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[32:35], v0, s[12:15], 0 offen offset:128
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[34:37], v0, s[12:15], 0 offen offset:128
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[36:39], v0, s[12:15], 0 offen offset:144
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v0, s[12:15], 0 offen offset:144
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[40:43], v0, s[12:15], 0 offen offset:160
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v0, s[12:15], 0 offen offset:160
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[44:47], v0, s[12:15], 0 offen offset:176
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v0, s[12:15], 0 offen offset:176
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[48:51], v0, s[12:15], 0 offen offset:192
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v0, s[12:15], 0 offen offset:192
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[52:55], v0, s[12:15], 0 offen offset:208
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v0, s[12:15], 0 offen offset:208
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[56:59], v0, s[12:15], 0 offen offset:224
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v0, s[12:15], 0 offen offset:224
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[60:63], v0, s[12:15], 0 offen offset:240
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 a[0:3], v0, s[12:15], 0 offen offset:240
; SDAG-GFX942-NEXT: s_cbranch_scc1 .LBB0_1
; SDAG-GFX942-NEXT: ; %bb.2: ; %memcpy-split
; SDAG-GFX942-NEXT: s_endpgm
@@ -440,46 +431,58 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7)
; GISEL-GFX942-NEXT: v_mov_b32_e32 v1, s16
; GISEL-GFX942-NEXT: .LBB0_1: ; %load-store-loop
; GISEL-GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
-; GISEL-GFX942-NEXT: v_add_u32_e32 v62, s0, v1
-; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v62, s[8:11], 0 offen
-; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[6:9], v62, s[8:11], 0 offen offset:16
-; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v62, s[8:11], 0 offen offset:32
-; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[14:17], v62, s[8:11], 0 offen offset:48
-; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[18:21], v62, s[8:11], 0 offen offset:64
-; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[22:25], v62, s[8:11], 0 offen offset:80
-; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[26:29], v62, s[8:11], 0 offen offset:96
-; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[30:33], v62, s[8:11], 0 offen offset:112
-; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[34:37], v62, s[8:11], 0 offen offset:128
-; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[38:41], v62, s[8:11], 0 offen offset:144
-; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[42:45], v62, s[8:11], 0 offen offset:160
-; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[46:49], v62, s[8:11], 0 offen offset:176
-; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[50:53], v62, s[8:11], 0 offen offset:192
-; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[54:57], v62, s[8:11], 0 offen offset:208
-; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[58:61], v62, s[8:11], 0 offen offset:224
-; GISEL-GFX942-NEXT: buffer_load_dwordx4 a[0:3], v62, s[8:11], 0 offen offset:240
-; GISEL-GFX942-NEXT: v_add_u32_e32 v63, s12, v1
+; GISEL-GFX942-NEXT: v_add_u32_e32 v2, s0, v1
+; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[4:7], v2, s[8:11], 0 offen
+; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[8:11], v2, s[8:11], 0 offen offset:16
+; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[12:15], v2, s[8:11], 0 offen offset:32
+; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[16:19], v2, s[8:11], 0 offen offset:48
+; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[20:23], v2, s[8:11], 0 offen offset:64
+; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[24:27], v2, s[8:11], 0 offen offset:80
+; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[28:31], v2, s[8:11], 0 offen offset:96
+; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[32:35], v2, s[8:11], 0 offen offset:112
+; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[36:39], v2, s[8:11], 0 offen offset:128
+; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[40:43], v2, s[8:11], 0 offen offset:144
+; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[44:47], v2, s[8:11], 0 offen offset:160
+; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[48:51], v2, s[8:11], 0 offen offset:176
+; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[52:55], v2, s[8:11], 0 offen offset:192
+; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[56:59], v2, s[8:11], 0 offen offset:208
+; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[60:63], v2, s[8:11], 0 offen offset:224
+; GISEL-GFX942-NEXT: buffer_load_dwordx4 a[0:3], v2, s[8:11], 0 offen offset:240
+; GISEL-GFX942-NEXT: v_add_u32_e32 v2, s12, v1
; GISEL-GFX942-NEXT: v_add_u32_e32 v1, 0x100, v1
; GISEL-GFX942-NEXT: v_cmp_lt_u32_e32 vcc, v1, v0
-; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
-; GISEL-GFX942-NEXT: scratch_store_dwordx4 off, a[0:3], off ; 16-byte Folded Spill
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v63, s[4:7], 0 offen offset:16
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v63, s[4:7], 0 offen offset:32
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v63, s[4:7], 0 offen offset:48
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v63, s[4:7], 0 offen offset:64
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v63, s[4:7], 0 offen offset:80
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v63, s[4:7], 0 offen offset:96
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[30:33], v63, s[4:7], 0 offen offset:112
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[34:37], v63, s[4:7], 0 offen offset:128
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v63, s[4:7], 0 offen offset:144
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v63, s[4:7], 0 offen offset:160
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v63, s[4:7], 0 offen offset:176
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v63, s[4:7], 0 offen offset:192
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v63, s[4:7], 0 offen offset:208
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v63, s[4:7], 0 offen offset:224
-; GISEL-GFX942-NEXT: scratch_load_dwordx4 v[2:5], off, off ; 16-byte Folded Reload
-; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen offset:240
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[4:7], v2, s[4:7], 0 offen
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[8:11], v2, s[4:7], 0 offen offset:16
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[12:15], v2, s[4:7], 0 offen offset:32
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[16:19], v2, s[4:7], 0 offen offset:48
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[20:23], v2, s[4:7], 0 offen offset:64
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[24:27], v2, s[4:7], 0 offen offset:80
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[28:31], v2, s[4:7], 0 offen offset:96
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[32:35], v2, s[4:7], 0 offen offset:112
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[36:39], v2, s[4:7], 0 offen offset:128
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[40:43], v2, s[4:7], 0 offen offset:144
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[44:47], v2, s[4:7], 0 offen offset:160
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[48:51], v2, s[4:7], 0 offen offset:176
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[52:55], v2, s[4:7], 0 offen offset:192
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[56:59], v2, s[4:7], 0 offen offset:208
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[60:63], v2, s[4:7], 0 offen offset:224
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 a[0:3], v2, s[4:7], 0 offen offset:240
; GISEL-GFX942-NEXT: s_cbranch_vccnz .LBB0_1
; GISEL-GFX942-NEXT: ; %bb.2: ; %memcpy-split
; GISEL-GFX942-NEXT: s_endpgm
@@ -820,30 +823,41 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[54:57], v1, s[4:7], 0 offen offset:208
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[58:61], v1, s[4:7], 0 offen offset:224
; SDAG-GFX942-NEXT: buffer_load_dwordx4 a[0:3], v1, s[4:7], 0 offen offset:240
-; SDAG-GFX942-NEXT: v_add_u32_e32 v62, s8, v0
+; SDAG-GFX942-NEXT: v_add_u32_e32 v1, s8, v0
; SDAG-GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0x100, v0
; SDAG-GFX942-NEXT: s_and_b64 vcc, exec, vcc
-; SDAG-GFX942-NEXT: s_waitcnt vmcnt(0)
-; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v63, a3 ; Reload Reuse
-; SDAG-GFX942-NEXT: scratch_store_dwordx3 off, a[0:2], off ; 12-byte Folded Spill
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[12:15], 0 offen
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v62, s[12:15], 0 offen offset:16
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v62, s[12:15], 0 offen offset:32
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v62, s[12:15], 0 offen offset:48
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v62, s[12:15], 0 offen offset:64
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v62, s[12:15], 0 offen offset:80
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v62, s[12:15], 0 offen offset:96
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[30:33], v62, s[12:15], 0 offen offset:112
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[34:37], v62, s[12:15], 0 offen offset:128
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v62, s[12:15], 0 offen offset:144
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v62, s[12:15], 0 offen offset:160
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v62, s[12:15], 0 offen offset:176
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v62, s[12:15], 0 offen offset:192
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v62, s[12:15], 0 offen offset:208
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v62, s[12:15], 0 offen offset:224
-; SDAG-GFX942-NEXT: scratch_load_dwordx3 v[2:4], off, off ; 12-byte Folded Reload
-; SDAG-GFX942-NEXT: s_waitcnt vmcnt(0)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[12:15], 0 offen offset:240
+; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v1, s[12:15], 0 offen
+; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v1, s[12:15], 0 offen offset:16
+; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v1, s[12:15], 0 offen offset:32
+; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v1, s[12:15], 0 offen offset:48
+; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v1, s[12:15], 0 offen offset:64
+; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v1, s[12:15], 0 offen offset:80
+; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v1, s[12:15], 0 offen offset:96
+; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[30:33], v1, s[12:15], 0 offen offset:112
+; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[34:37], v1, s[12:15], 0 offen offset:128
+; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v1, s[12:15], 0 offen offset:144
+; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v1, s[12:15], 0 offen offset:160
+; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v1, s[12:15], 0 offen offset:176
+; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v1, s[12:15], 0 offen offset:192
+; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v1, s[12:15], 0 offen offset:208
+; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v1, s[12:15], 0 offen offset:224
+; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 a[0:3], v1, s[12:15], 0 offen offset:240
; SDAG-GFX942-NEXT: s_cbranch_vccnz .LBB1_1
; SDAG-GFX942-NEXT: ; %bb.2: ; %memcpy-split
; SDAG-GFX942-NEXT: s_endpgm
@@ -977,32 +991,43 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[54:57], v1, s[8:11], 0 offen offset:208
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[58:61], v1, s[8:11], 0 offen offset:224
; GISEL-GFX942-NEXT: buffer_load_dwordx4 a[0:3], v1, s[8:11], 0 offen offset:240
-; GISEL-GFX942-NEXT: v_add_u32_e32 v62, s12, v0
+; GISEL-GFX942-NEXT: v_add_u32_e32 v1, s12, v0
; GISEL-GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0x100, v0
; GISEL-GFX942-NEXT: s_xor_b64 s[2:3], vcc, -1
; GISEL-GFX942-NEXT: s_xor_b64 s[2:3], s[2:3], -1
; GISEL-GFX942-NEXT: s_and_b64 vcc, s[2:3], exec
-; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
-; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v63, a3 ; Reload Reuse
-; GISEL-GFX942-NEXT: scratch_store_dwordx3 off, a[0:2], off ; 12-byte Folded Spill
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[4:7], 0 offen
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v62, s[4:7], 0 offen offset:16
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v62, s[4:7], 0 offen offset:32
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v62, s[4:7], 0 offen offset:48
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v62, s[4:7], 0 offen offset:64
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v62, s[4:7], 0 offen offset:80
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v62, s[4:7], 0 offen offset:96
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[30:33], v62, s[4:7], 0 offen offset:112
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[34:37], v62, s[4:7], 0 offen offset:128
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v62, s[4:7], 0 offen offset:144
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v62, s[4:7], 0 offen offset:160
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v62, s[4:7], 0 offen offset:176
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v62, s[4:7], 0 offen offset:192
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v62, s[4:7], 0 offen offset:208
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v62, s[4:7], 0 offen offset:224
-; GISEL-GFX942-NEXT: scratch_load_dwordx3 v[2:4], off, off ; 12-byte Folded Reload
-; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[4:7], 0 offen offset:240
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v1, s[4:7], 0 offen
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v1, s[4:7], 0 offen offset:16
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v1, s[4:7], 0 offen offset:32
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v1, s[4:7], 0 offen offset:48
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v1, s[4:7], 0 offen offset:64
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v1, s[4:7], 0 offen offset:80
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v1, s[4:7], 0 offen offset:96
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[30:33], v1, s[4:7], 0 offen offset:112
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[34:37], v1, s[4:7], 0 offen offset:128
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v1, s[4:7], 0 offen offset:144
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v1, s[4:7], 0 offen offset:160
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v1, s[4:7], 0 offen offset:176
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v1, s[4:7], 0 offen offset:192
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v1, s[4:7], 0 offen offset:208
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v1, s[4:7], 0 offen offset:224
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 a[0:3], v1, s[4:7], 0 offen offset:240
; GISEL-GFX942-NEXT: s_cbranch_vccnz .LBB1_1
; GISEL-GFX942-NEXT: ; %bb.2: ; %memcpy-split
; GISEL-GFX942-NEXT: s_endpgm
@@ -1146,8 +1171,8 @@ define amdgpu_kernel void @memcpy_known_small(ptr addrspace(7) %src, ptr addrspa
; SDAG-GFX942-NEXT: s_mov_b32 s2, s1
; SDAG-GFX942-NEXT: s_mov_b32 s3, s12
; SDAG-GFX942-NEXT: s_or_b64 s[8:9], s[2:3], s[12:13]
-; SDAG-GFX942-NEXT: v_mov_b32_e32 v4, s0
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[0:3], v4, s[8:11], 0 offen
+; SDAG-GFX942-NEXT: v_mov_b32_e32 v0, s0
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v0, s[8:11], 0 offen
; SDAG-GFX942-NEXT: s_load_dword s13, s[4:5], 0x54
; SDAG-GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x44
; SDAG-GFX942-NEXT: s_mov_b32 s5, s12
@@ -1158,12 +1183,12 @@ define amdgpu_kernel void @memcpy_known_small(ptr addrspace(7) %src, ptr addrspa
; SDAG-GFX942-NEXT: s_mov_b32 s2, s1
; SDAG-GFX942-NEXT: s_mov_b32 s3, s12
; SDAG-GFX942-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
-; SDAG-GFX942-NEXT: v_mov_b32_e32 v5, s0
+; SDAG-GFX942-NEXT: v_mov_b32_e32 v1, s0
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(0)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[0:3], v5, s[4:7], 0 offen
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:16
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v1, s[4:7], 0 offen
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v0, s[8:11], 0 offen offset:16
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(0)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[0:3], v5, s[4:7], 0 offen offset:16
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v1, s[4:7], 0 offen offset:16
; SDAG-GFX942-NEXT: s_endpgm
;
; SDAG-GFX1100-LABEL: memcpy_known_small:
@@ -1217,8 +1242,8 @@ define amdgpu_kernel void @memcpy_known_small(ptr addrspace(7) %src, ptr addrspa
; GISEL-GFX942-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9]
; GISEL-GFX942-NEXT: s_mov_b32 s6, s3
; GISEL-GFX942-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11]
-; GISEL-GFX942-NEXT: v_mov_b32_e32 v4, s0
-; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[0:3], v4, s[8:11], 0 offen
+; GISEL-GFX942-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v0, s[8:11], 0 offen
; GISEL-GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x44
; GISEL-GFX942-NEXT: s_load_dword s13, s[4:5], 0x54
; GISEL-GFX942-NEXT: s_mov_b32 s4, s7
@@ -1229,12 +1254,12 @@ define amdgpu_kernel void @memcpy_known_small(ptr addrspace(7) %src, ptr addrspa
; GISEL-GFX942-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
; GISEL-GFX942-NEXT: s_mov_b32 s6, s3
; GISEL-GFX942-NEXT: s_or_b64 s[6:7], s[6:7], s[12:13]
-; GISEL-GFX942-NEXT: v_mov_b32_e32 v5, s0
+; GISEL-GFX942-NEXT: v_mov_b32_e32 v1, s0
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[0:3], v5, s[4:7], 0 offen
-; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:16
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v1, s[4:7], 0 offen
+; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v0, s[8:11], 0 offen offset:16
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[0:3], v5, s[4:7], 0 offen offset:16
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v1, s[4:7], 0 offen offset:16
; GISEL-GFX942-NEXT: s_endpgm
;
; GISEL-GFX1100-LABEL: memcpy_known_small:
diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2_a_v.ll b/llvm/test/CodeGen/AMDGPU/ds_write2_a_v.ll
index 683887b0a55f3..8b998354b1f4f 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_write2_a_v.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_write2_a_v.ll
@@ -426,122 +426,126 @@ define void @ds_write2_b32_av_av_no_vgprs(ptr addrspace(3) %lds) #0 {
; GCN-LABEL: ds_write2_b32_av_av_no_vgprs:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_write_b32 a6, v43 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_write_b32 a7, v44 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_write_b32 a8, v45 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_write_b32 a9, v46 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_write_b32 a10, v47 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_write_b32 a11, v56 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_write_b32 a12, v57 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_write_b32 a13, v58 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_write_b32 a14, v59 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_write_b32 a15, v60 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse
+; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword a32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword a33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword a34, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-NEXT: v_accvgpr_write_b32 a0, v0
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; def a1
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: ;;#ASMSTART
-; GCN-NEXT: ; def a2
+; GCN-NEXT: ; def a34
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; def v[0:31]
; GCN-NEXT: ;;#ASMEND
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
-; GCN-NEXT: s_nop 0
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; GCN-NEXT: v_accvgpr_write_b32 a19, v31 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a33, v31
+; GCN-NEXT: v_accvgpr_write_b32 a32, v30
+; GCN-NEXT: v_accvgpr_write_b32 a31, v29
+; GCN-NEXT: v_accvgpr_write_b32 a30, v28
+; GCN-NEXT: v_accvgpr_write_b32 a29, v27
+; GCN-NEXT: v_accvgpr_write_b32 a28, v26
+; GCN-NEXT: v_accvgpr_write_b32 a27, v25
+; GCN-NEXT: v_accvgpr_write_b32 a26, v24
+; GCN-NEXT: v_accvgpr_write_b32 a25, v23
+; GCN-NEXT: v_accvgpr_write_b32 a24, v22
+; GCN-NEXT: v_accvgpr_write_b32 a23, v21
+; GCN-NEXT: v_accvgpr_write_b32 a22, v20
+; GCN-NEXT: v_accvgpr_write_b32 a21, v19
+; GCN-NEXT: v_accvgpr_write_b32 a20, v18
+; GCN-NEXT: v_accvgpr_write_b32 a19, v17
+; GCN-NEXT: v_accvgpr_write_b32 a18, v16
+; GCN-NEXT: v_accvgpr_write_b32 a17, v15
+; GCN-NEXT: v_accvgpr_write_b32 a16, v14
+; GCN-NEXT: v_accvgpr_write_b32 a15, v13
+; GCN-NEXT: v_accvgpr_write_b32 a14, v12
+; GCN-NEXT: v_accvgpr_write_b32 a13, v11
+; GCN-NEXT: v_accvgpr_write_b32 a12, v10
+; GCN-NEXT: v_accvgpr_write_b32 a11, v9
+; GCN-NEXT: v_accvgpr_write_b32 a10, v8
+; GCN-NEXT: v_accvgpr_write_b32 a9, v7
+; GCN-NEXT: v_accvgpr_write_b32 a8, v6
+; GCN-NEXT: v_accvgpr_write_b32 a7, v5
+; GCN-NEXT: v_accvgpr_write_b32 a6, v4
+; GCN-NEXT: v_accvgpr_write_b32 a5, v3
+; GCN-NEXT: v_accvgpr_write_b32 a4, v2
+; GCN-NEXT: v_accvgpr_write_b32 a3, v1
+; GCN-NEXT: v_accvgpr_write_b32 a2, v0
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v2, a34
; GCN-NEXT: ds_write2_b32 v0, v1, v2 offset0:10 offset1:24
-; GCN-NEXT: v_accvgpr_write_b32 a31, v19 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_write_b32 a30, v20 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_write_b32 a29, v21 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_write_b32 a28, v22 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_write_b32 a27, v23 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_write_b32 a26, v24 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_write_b32 a25, v25 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_write_b32 a24, v26 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_write_b32 a23, v27 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; GCN-NEXT: v_accvgpr_read_b32 v19, a31 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_read_b32 v20, a30 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_read_b32 v21, a29 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_read_b32 v22, a28 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_read_b32 v23, a27 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_read_b32 v24, a26 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_read_b32 v25, a25 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_read_b32 v26, a24 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_read_b32 v27, a23 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_read_b32 v28, a22 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_read_b32 v29, a21 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_read_b32 v30, a20 ; Reload Reuse
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_accvgpr_read_b32 v31, a19 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v0, a2
+; GCN-NEXT: v_accvgpr_read_b32 v1, a3
+; GCN-NEXT: v_accvgpr_read_b32 v2, a4
+; GCN-NEXT: v_accvgpr_read_b32 v3, a5
+; GCN-NEXT: v_accvgpr_read_b32 v4, a6
+; GCN-NEXT: v_accvgpr_read_b32 v5, a7
+; GCN-NEXT: v_accvgpr_read_b32 v6, a8
+; GCN-NEXT: v_accvgpr_read_b32 v7, a9
+; GCN-NEXT: v_accvgpr_read_b32 v8, a10
+; GCN-NEXT: v_accvgpr_read_b32 v9, a11
+; GCN-NEXT: v_accvgpr_read_b32 v10, a12
+; GCN-NEXT: v_accvgpr_read_b32 v11, a13
+; GCN-NEXT: v_accvgpr_read_b32 v12, a14
+; GCN-NEXT: v_accvgpr_read_b32 v13, a15
+; GCN-NEXT: v_accvgpr_read_b32 v14, a16
+; GCN-NEXT: v_accvgpr_read_b32 v15, a17
+; GCN-NEXT: v_accvgpr_read_b32 v16, a18
+; GCN-NEXT: v_accvgpr_read_b32 v17, a19
+; GCN-NEXT: v_accvgpr_read_b32 v18, a20
+; GCN-NEXT: v_accvgpr_read_b32 v19, a21
+; GCN-NEXT: v_accvgpr_read_b32 v20, a22
+; GCN-NEXT: v_accvgpr_read_b32 v21, a23
+; GCN-NEXT: v_accvgpr_read_b32 v22, a24
+; GCN-NEXT: v_accvgpr_read_b32 v23, a25
+; GCN-NEXT: v_accvgpr_read_b32 v24, a26
+; GCN-NEXT: v_accvgpr_read_b32 v25, a27
+; GCN-NEXT: v_accvgpr_read_b32 v26, a28
+; GCN-NEXT: v_accvgpr_read_b32 v27, a29
+; GCN-NEXT: v_accvgpr_read_b32 v28, a30
+; GCN-NEXT: v_accvgpr_read_b32 v29, a31
+; GCN-NEXT: v_accvgpr_read_b32 v30, a32
+; GCN-NEXT: v_accvgpr_read_b32 v31, a33
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; use v[0:31]
; GCN-NEXT: ;;#ASMEND
-; GCN-NEXT: v_accvgpr_read_b32 v63, a18 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_read_b32 v62, a17 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_read_b32 v61, a16 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_read_b32 v60, a15 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_read_b32 v59, a14 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_read_b32 v58, a13 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_read_b32 v57, a12 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_read_b32 v56, a11 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_read_b32 v47, a10 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_read_b32 v46, a9 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_read_b32 v45, a8 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_read_b32 v44, a7 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_read_b32 v43, a6 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_read_b32 v42, a5 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_read_b32 v41, a4 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_read_b32 v40, a3 ; Reload Reuse
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: buffer_load_dword a34, off, s[0:3], s32 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword a33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword a32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(3) %lds, i32 0, i32 10
%gep.1 = getelementptr inbounds [512 x i32], ptr addrspace(3) %lds, i32 0, i32 24
@@ -976,123 +980,133 @@ define void @ds_write2_b64_av_av_no_vgprs(ptr addrspace(3) %lds) #0 {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_accvgpr_write_b32 a1, v40 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_write_b32 a6, v41 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_write_b32 a7, v42 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_write_b32 a8, v43 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_write_b32 a9, v44 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_write_b32 a10, v45 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_write_b32 a11, v46 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_write_b32 a12, v47 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_write_b32 a13, v56 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_write_b32 a14, v57 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_write_b32 a15, v58 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_write_b32 a16, v59 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_write_b32 a17, v60 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_write_b32 a18, v61 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_write_b32 a19, v62 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_write_b32 a20, v63 ; Reload Reuse
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword a32, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword a33, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword a34, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword a35, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword a36, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword a37, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-NEXT: v_accvgpr_write_b32 a0, v0
; GCN-NEXT: ;;#ASMSTART
-; GCN-NEXT: ; def a[2:3]
+; GCN-NEXT: ; def a[34:35]
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: ;;#ASMSTART
-; GCN-NEXT: ; def a[4:5]
+; GCN-NEXT: ; def a[36:37]
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; def v[0:31]
; GCN-NEXT: ;;#ASMEND
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
-; GCN-NEXT: s_nop 0
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GCN-NEXT: v_accvgpr_write_b32 a21, v31 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_write_b32 a33, v31
+; GCN-NEXT: v_accvgpr_write_b32 a32, v30
+; GCN-NEXT: v_accvgpr_write_b32 a31, v29
+; GCN-NEXT: v_accvgpr_write_b32 a30, v28
+; GCN-NEXT: v_accvgpr_write_b32 a29, v27
+; GCN-NEXT: v_accvgpr_write_b32 a28, v26
+; GCN-NEXT: v_accvgpr_write_b32 a27, v25
+; GCN-NEXT: v_accvgpr_write_b32 a26, v24
+; GCN-NEXT: v_accvgpr_write_b32 a25, v23
+; GCN-NEXT: v_accvgpr_write_b32 a24, v22
+; GCN-NEXT: v_accvgpr_write_b32 a23, v21
+; GCN-NEXT: v_accvgpr_write_b32 a22, v20
+; GCN-NEXT: v_accvgpr_write_b32 a21, v19
+; GCN-NEXT: v_accvgpr_write_b32 a20, v18
+; GCN-NEXT: v_accvgpr_write_b32 a19, v17
+; GCN-NEXT: v_accvgpr_write_b32 a18, v16
+; GCN-NEXT: v_accvgpr_write_b32 a17, v15
+; GCN-NEXT: v_accvgpr_write_b32 a16, v14
+; GCN-NEXT: v_accvgpr_write_b32 a15, v13
+; GCN-NEXT: v_accvgpr_write_b32 a14, v12
+; GCN-NEXT: v_accvgpr_write_b32 a13, v11
+; GCN-NEXT: v_accvgpr_write_b32 a12, v10
+; GCN-NEXT: v_accvgpr_write_b32 a11, v9
+; GCN-NEXT: v_accvgpr_write_b32 a10, v8
+; GCN-NEXT: v_accvgpr_write_b32 a9, v7
+; GCN-NEXT: v_accvgpr_write_b32 a8, v6
+; GCN-NEXT: v_accvgpr_write_b32 a7, v5
+; GCN-NEXT: v_accvgpr_write_b32 a6, v4
+; GCN-NEXT: v_accvgpr_write_b32 a5, v3
+; GCN-NEXT: v_accvgpr_write_b32 a4, v2
+; GCN-NEXT: v_accvgpr_write_b32 a3, v1
+; GCN-NEXT: v_accvgpr_write_b32 a2, v0
+; GCN-NEXT: v_accvgpr_read_b32 v2, a34
+; GCN-NEXT: v_accvgpr_read_b32 v4, a36
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v3, a35
+; GCN-NEXT: v_accvgpr_read_b32 v5, a37
; GCN-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset0:10 offset1:24
-; GCN-NEXT: v_accvgpr_write_b32 a31, v21 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_write_b32 a30, v22 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_write_b32 a29, v23 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_write_b32 a28, v24 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_write_b32 a27, v25 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_write_b32 a26, v26 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_write_b32 a25, v27 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_write_b32 a24, v28 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_write_b32 a23, v29 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_write_b32 a22, v30 ; Reload Reuse
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; GCN-NEXT: v_accvgpr_read_b32 v21, a31 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_read_b32 v22, a30 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_read_b32 v23, a29 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_read_b32 v24, a28 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_read_b32 v25, a27 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_read_b32 v26, a26 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_read_b32 v27, a25 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_read_b32 v28, a24 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_read_b32 v29, a23 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_read_b32 v30, a22 ; Reload Reuse
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_accvgpr_read_b32 v31, a21 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v0, a2
+; GCN-NEXT: v_accvgpr_read_b32 v1, a3
+; GCN-NEXT: v_accvgpr_read_b32 v2, a4
+; GCN-NEXT: v_accvgpr_read_b32 v3, a5
+; GCN-NEXT: v_accvgpr_read_b32 v4, a6
+; GCN-NEXT: v_accvgpr_read_b32 v5, a7
+; GCN-NEXT: v_accvgpr_read_b32 v6, a8
+; GCN-NEXT: v_accvgpr_read_b32 v7, a9
+; GCN-NEXT: v_accvgpr_read_b32 v8, a10
+; GCN-NEXT: v_accvgpr_read_b32 v9, a11
+; GCN-NEXT: v_accvgpr_read_b32 v10, a12
+; GCN-NEXT: v_accvgpr_read_b32 v11, a13
+; GCN-NEXT: v_accvgpr_read_b32 v12, a14
+; GCN-NEXT: v_accvgpr_read_b32 v13, a15
+; GCN-NEXT: v_accvgpr_read_b32 v14, a16
+; GCN-NEXT: v_accvgpr_read_b32 v15, a17
+; GCN-NEXT: v_accvgpr_read_b32 v16, a18
+; GCN-NEXT: v_accvgpr_read_b32 v17, a19
+; GCN-NEXT: v_accvgpr_read_b32 v18, a20
+; GCN-NEXT: v_accvgpr_read_b32 v19, a21
+; GCN-NEXT: v_accvgpr_read_b32 v20, a22
+; GCN-NEXT: v_accvgpr_read_b32 v21, a23
+; GCN-NEXT: v_accvgpr_read_b32 v22, a24
+; GCN-NEXT: v_accvgpr_read_b32 v23, a25
+; GCN-NEXT: v_accvgpr_read_b32 v24, a26
+; GCN-NEXT: v_accvgpr_read_b32 v25, a27
+; GCN-NEXT: v_accvgpr_read_b32 v26, a28
+; GCN-NEXT: v_accvgpr_read_b32 v27, a29
+; GCN-NEXT: v_accvgpr_read_b32 v28, a30
+; GCN-NEXT: v_accvgpr_read_b32 v29, a31
+; GCN-NEXT: v_accvgpr_read_b32 v30, a32
+; GCN-NEXT: v_accvgpr_read_b32 v31, a33
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; use v[0:31]
; GCN-NEXT: ;;#ASMEND
-; GCN-NEXT: v_accvgpr_read_b32 v63, a20 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_read_b32 v62, a19 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_read_b32 v61, a18 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_read_b32 v60, a17 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_read_b32 v59, a16 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_read_b32 v58, a15 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_read_b32 v57, a14 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_read_b32 v56, a13 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_read_b32 v47, a12 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_read_b32 v46, a11 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_read_b32 v45, a10 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_read_b32 v44, a9 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_read_b32 v43, a8 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_read_b32 v42, a7 ; Reload Reuse
-; GCN-NEXT: v_accvgpr_read_b32 v41, a6 ; Reload Reuse
+; GCN-NEXT: buffer_load_dword a37, off, s[0:3], s32 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword a36, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword a35, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword a34, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword a33, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword a32, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; GCN-NEXT: v_accvgpr_read_b32 v40, a1 ; Reload Reuse
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(3) %lds, i32 0, i32 10
%gep.1 = getelementptr inbounds [512 x i64], ptr addrspace(3) %lds, i32 0, i32 24
diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
index 1e7855ccb3642..af817c3ee4eb1 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
@@ -1012,7 +1012,6 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-SDAG-NEXT: buffer_inv sc1
; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-SDAG-NEXT: s_cbranch_execz .LBB12_2
; GFX950-SDAG-NEXT: .LBB12_4: ; %atomicrmw.private
@@ -1045,7 +1044,6 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-GISEL-NEXT: buffer_inv sc1
; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-GISEL-NEXT: s_cbranch_execz .LBB12_2
; GFX950-GISEL-NEXT: .LBB12_4: ; %atomicrmw.private
@@ -1169,7 +1167,6 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-SDAG-NEXT: buffer_inv sc1
; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-SDAG-NEXT: s_cbranch_execz .LBB13_2
; GFX950-SDAG-NEXT: .LBB13_4: ; %atomicrmw.private
@@ -1206,7 +1203,6 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-GISEL-NEXT: buffer_inv sc1
; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-GISEL-NEXT: s_cbranch_execz .LBB13_2
; GFX950-GISEL-NEXT: .LBB13_4: ; %atomicrmw.private
diff --git a/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll b/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll
index 57bfd2490f9da..d973f7b71fb6d 100644
--- a/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll
@@ -19,11 +19,11 @@ define amdgpu_kernel void @half8(ptr addrspace(1) nocapture readonly %0, ptr add
; GFX90A-LABEL: half8:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1]
+; GFX90A-NEXT: global_load_dwordx4 v[2:5], v0, s[0:1]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
+; GFX90A-NEXT: global_store_dwordx4 v0, v[2:5], s[2:3]
; GFX90A-NEXT: s_endpgm
;
; GFX1030-LABEL: half8:
@@ -85,11 +85,11 @@ define amdgpu_kernel void @half6(ptr addrspace(1) nocapture readonly %0, ptr add
; GFX90A-LABEL: half6:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: global_load_dwordx3 v[0:2], v3, s[0:1]
+; GFX90A-NEXT: global_load_dwordx3 v[2:4], v0, s[0:1]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3]
+; GFX90A-NEXT: global_store_dwordx3 v0, v[2:4], s[2:3]
; GFX90A-NEXT: s_endpgm
;
; GFX1030-LABEL: half6:
diff --git a/llvm/test/CodeGen/AMDGPU/illegal-sgpr-to-vgpr-copy.ll b/llvm/test/CodeGen/AMDGPU/illegal-sgpr-to-vgpr-copy.ll
index 597f90c0f4e84..554d4f69ea4a2 100644
--- a/llvm/test/CodeGen/AMDGPU/illegal-sgpr-to-vgpr-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/illegal-sgpr-to-vgpr-copy.ll
@@ -43,8 +43,7 @@ define amdgpu_kernel void @illegal_vgpr_to_sgpr_copy_v16i32() #0 {
}
; ERR: error: <unknown>:0:0: in function illegal_agpr_to_sgpr_copy_i32 void (): illegal VGPR to SGPR copy
-; GCN: v_accvgpr_read_b32 [[COPY1:v[0-9]+]], a1
-; GCN: ; illegal copy [[COPY1]] to s9
+; GCN: ; illegal copy a1 to s9
define amdgpu_kernel void @illegal_agpr_to_sgpr_copy_i32() #1 {
%agpr = call i32 asm sideeffect "; def $0", "=${a1}"()
call void asm sideeffect "; use $0", "${s9}"(i32 %agpr)
@@ -52,9 +51,7 @@ define amdgpu_kernel void @illegal_agpr_to_sgpr_copy_i32() #1 {
}
; ERR: error: <unknown>:0:0: in function illegal_agpr_to_sgpr_copy_v2i32 void (): illegal VGPR to SGPR copy
-; GCN-DAG: v_accvgpr_read_b32 v[[COPY1L:[0-9]+]], a0
-; GCN-DAG: v_accvgpr_read_b32 v[[COPY1H:[0-9]+]], a1
-; GCN: ; illegal copy v[[[COPY1L]]:[[COPY1H]]] to s[10:11]
+; GCN: ; illegal copy a[0:1] to s[10:11]
define amdgpu_kernel void @illegal_agpr_to_sgpr_copy_v2i32() #1 {
%vgpr = call <2 x i32> asm sideeffect "; def $0", "=${a[0:1]}"()
call void asm sideeffect "; use $0", "${s[10:11]}"(<2 x i32> %vgpr)
diff --git a/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll b/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll
index b91963f08681c..364d2f52777d3 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll
@@ -49,10 +49,10 @@ define amdgpu_kernel void @barrier_release(<4 x i32> inreg %rsrc,
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x3c
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_barrier
-; GFX90A-NEXT: ds_read_b32 v0, v0
-; GFX90A-NEXT: v_mov_b32_e32 v1, 0
+; GFX90A-NEXT: ds_read_b32 v1, v0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX90A-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: barrier_release:
@@ -72,10 +72,10 @@ define amdgpu_kernel void @barrier_release(<4 x i32> inreg %rsrc,
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: s_barrier
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0
+; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-LABEL: barrier_release:
@@ -94,10 +94,10 @@ define amdgpu_kernel void @barrier_release(<4 x i32> inreg %rsrc,
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x3c
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: s_barrier
-; GFX942-NEXT: ds_read_b32 v0, v0
-; GFX942-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-NEXT: ds_read_b32 v1, v0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX942-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: barrier_release:
@@ -117,10 +117,10 @@ define amdgpu_kernel void @barrier_release(<4 x i32> inreg %rsrc,
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: s_barrier
; GFX942-TGSPLIT-NEXT: buffer_inv sc0
-; GFX942-TGSPLIT-NEXT: ds_read_b32 v0, v0
-; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-TGSPLIT-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX10WGP-LABEL: barrier_release:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll
index 3e96dfe40f745..a57b43a81205b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll
@@ -37,11 +37,11 @@ entry:
define amdgpu_ps void @ds_read_b96_tr_b6(ptr addrspace(3) %addr, ptr addrspace(1) %use) {
; GFX950-SDAG-LABEL: ds_read_b96_tr_b6:
; GFX950-SDAG: ; %bb.0: ; %entry
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, v2
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, v1
-; GFX950-SDAG-NEXT: ds_read_b96_tr_b6 v[0:2], v0 offset:32
+; GFX950-SDAG-NEXT: ds_read_b96_tr_b6 v[4:6], v0 offset:32
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT: global_store_dwordx3 v[4:5], v[0:2], off
+; GFX950-SDAG-NEXT: global_store_dwordx3 v[2:3], v[4:6], off
; GFX950-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: ds_read_b96_tr_b6:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
index 7959cee49b93f..fb32a83f3cf3c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
@@ -294,17 +294,17 @@ define amdgpu_kernel void @test_iglp_opt_asm_sideeffect(ptr addrspace(3) noalias
; GCN-NEXT: ; iglp_opt mask(0x00000000)
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_add_u32_e32 v1, s0, v0
-; GCN-NEXT: ds_read_b32 v1, v1
+; GCN-NEXT: ds_read_b32 v2, v1
; GCN-NEXT: v_add_u32_e32 v0, s1, v0
-; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: ds_write_b32 v0, v1
+; GCN-NEXT: ds_write_b32 v0, v2
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ;;#ASMEND
-; GCN-NEXT: ds_read_b32 v0, v2 offset:256
-; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: ds_read_b32 v1, v1 offset:256
+; GCN-NEXT: v_mov_b32_e32 v0, s1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: ds_write_b32 v1, v0 offset:256
+; GCN-NEXT: ds_write_b32 v0, v1 offset:256
; GCN-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.gfx90a.ll
index 49607e320bd0a..efd5df85280e6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.gfx90a.ll
@@ -39,9 +39,7 @@ define amdgpu_ps void @atomic_cmpswap_1d_agpr(<8 x i32> inreg %rsrc, i32 %s) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a1
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
-; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1
-; GFX90A-NEXT: image_atomic_cmpswap v[2:3], v0, s[0:7] dmask:0x3 unorm glc
+; GFX90A-NEXT: image_atomic_cmpswap a[0:1], v0, s[0:7] dmask:0x3 unorm glc
; GFX90A-NEXT: s_endpgm
%cmp = call i32 asm "; def $0", "=a"()
%swap = call i32 asm "; def $0", "=a"()
@@ -70,14 +68,10 @@ define amdgpu_ps void @atomic_cmpswap_1d_64_agpr(<8 x i32> inreg %rsrc, i32 %s)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1
-; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def a[0:1]
+; GFX90A-NEXT: ; def a[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1
-; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
-; GFX90A-NEXT: image_atomic_cmpswap v[2:5], v0, s[0:7] dmask:0xf unorm glc
+; GFX90A-NEXT: image_atomic_cmpswap a[0:3], v0, s[0:7] dmask:0xf unorm glc
; GFX90A-NEXT: s_endpgm
%cmp = call i64 asm "; def $0", "=a"()
%swap = call i64 asm "; def $0", "=a"()
@@ -92,8 +86,7 @@ define amdgpu_ps void @atomic_swap_1d_agpr_noret(<8 x i32> inreg %rsrc, i32 %s)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v1, a0
-; GFX90A-NEXT: image_atomic_swap v1, v0, s[0:7] dmask:0x1 unorm glc
+; GFX90A-NEXT: image_atomic_swap a0, v0, s[0:7] dmask:0x1 unorm glc
; GFX90A-NEXT: s_endpgm
%data = call i32 asm "; def $0", "=a"()
%unused = call i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -106,8 +99,7 @@ define amdgpu_ps void @atomic_add_2d_agpr_noret(<8 x i32> inreg %rsrc, i32 %s, i
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
-; GFX90A-NEXT: image_atomic_add v2, v[0:1], s[0:7] dmask:0x1 unorm glc
+; GFX90A-NEXT: image_atomic_add a0, v[0:1], s[0:7] dmask:0x1 unorm glc
; GFX90A-NEXT: s_endpgm
%data = call i32 asm "; def $0", "=a"()
%unused = call i32 @llvm.amdgcn.image.atomic.add.2d.i32.i32(i32 %data, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
@@ -123,9 +115,7 @@ define amdgpu_ps void @atomic_cmpswap_1d_agpr_noret(<8 x i32> inreg %rsrc, i32 %
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a1
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
-; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1
-; GFX90A-NEXT: image_atomic_cmpswap v[2:3], v0, s[0:7] dmask:0x3 unorm glc
+; GFX90A-NEXT: image_atomic_cmpswap a[0:1], v0, s[0:7] dmask:0x3 unorm glc
; GFX90A-NEXT: s_endpgm
%cmp = call i32 asm "; def $0", "=a"()
%swap = call i32 asm "; def $0", "=a"()
@@ -139,9 +129,7 @@ define amdgpu_ps void @atomic_swap_1d_i64_agpr_noret(<8 x i32> inreg %rsrc, i32
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1
-; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
-; GFX90A-NEXT: image_atomic_swap v[2:3], v0, s[0:7] dmask:0x3 unorm glc
+; GFX90A-NEXT: image_atomic_swap a[0:1], v0, s[0:7] dmask:0x3 unorm glc
; GFX90A-NEXT: s_endpgm
%data = call i64 asm "; def $0", "=a"()
%unused = call i64 @llvm.amdgcn.image.atomic.swap.1d.i64.i32(i64 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -154,14 +142,10 @@ define amdgpu_ps void @atomic_cmpswap_1d_64_agpr_noret(<8 x i32> inreg %rsrc, i3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1
-; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def a[0:1]
+; GFX90A-NEXT: ; def a[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1
-; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
-; GFX90A-NEXT: image_atomic_cmpswap v[2:5], v0, s[0:7] dmask:0xf unorm glc
+; GFX90A-NEXT: image_atomic_cmpswap a[0:3], v0, s[0:7] dmask:0xf unorm glc
; GFX90A-NEXT: s_endpgm
%cmp = call i64 asm "; def $0", "=a"()
%swap = call i64 asm "; def $0", "=a"()
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll
index 12a998ad82cd2..92a5f88246888 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll
@@ -89,59 +89,59 @@ define amdgpu_kernel void @test_mfma_f32_32x32x2bf16(ptr addrspace(1) %arg) #0 {
; GFX908-NEXT: v_mfma_f32_32x32x2bf16 a[0:31], v3, v0, a[0:31] cbsz:1 abid:2 blgp:3
; GFX908-NEXT: s_nop 15
; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a31
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a30
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a29
; GFX908-NEXT: v_accvgpr_read_b32 v0, a28
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a31
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:112
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a19
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a18
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a17
; GFX908-NEXT: v_accvgpr_read_b32 v0, a16
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a19
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a23
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a22
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a21
; GFX908-NEXT: v_accvgpr_read_b32 v0, a20
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a23
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a11
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a10
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a9
; GFX908-NEXT: v_accvgpr_read_b32 v0, a8
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a11
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a15
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a13
; GFX908-NEXT: v_accvgpr_read_b32 v0, a12
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a15
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35]
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16
; GFX908-NEXT: s_endpgm
@@ -255,25 +255,25 @@ define amdgpu_kernel void @test_mfma_f32_16x16x2bf16(ptr addrspace(1) %arg) #0 {
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_mfma_f32_16x16x2bf16 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
; GFX908-NEXT: s_nop 9
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a15
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a13
; GFX908-NEXT: v_accvgpr_read_b32 v0, a12
-; GFX908-NEXT: v_accvgpr_read_b32 v7, a11
-; GFX908-NEXT: v_accvgpr_read_b32 v6, a10
-; GFX908-NEXT: v_accvgpr_read_b32 v5, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a15
; GFX908-NEXT: v_accvgpr_read_b32 v4, a8
-; GFX908-NEXT: v_accvgpr_read_b32 v11, a7
-; GFX908-NEXT: v_accvgpr_read_b32 v10, a6
-; GFX908-NEXT: v_accvgpr_read_b32 v9, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v7, a11
; GFX908-NEXT: v_accvgpr_read_b32 v8, a4
+; GFX908-NEXT: v_accvgpr_read_b32 v9, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v10, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v11, a7
; GFX908-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:48
; GFX908-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:32
; GFX908-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17]
; GFX908-NEXT: s_endpgm
@@ -422,22 +422,22 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16(ptr addrspace(1) %arg) #0 {
; GFX908-NEXT: v_mfma_f32_32x32x4bf16 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
; GFX908-NEXT: s_nop 15
; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a15
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a13
; GFX908-NEXT: v_accvgpr_read_b32 v0, a12
-; GFX908-NEXT: v_accvgpr_read_b32 v7, a11
-; GFX908-NEXT: v_accvgpr_read_b32 v6, a10
-; GFX908-NEXT: v_accvgpr_read_b32 v5, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a15
; GFX908-NEXT: v_accvgpr_read_b32 v4, a8
-; GFX908-NEXT: v_accvgpr_read_b32 v11, a7
-; GFX908-NEXT: v_accvgpr_read_b32 v10, a6
-; GFX908-NEXT: v_accvgpr_read_b32 v9, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v7, a11
; GFX908-NEXT: v_accvgpr_read_b32 v8, a4
-; GFX908-NEXT: v_accvgpr_read_b32 v15, a3
-; GFX908-NEXT: v_accvgpr_read_b32 v14, a2
-; GFX908-NEXT: v_accvgpr_read_b32 v13, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v9, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v10, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v11, a7
; GFX908-NEXT: v_accvgpr_read_b32 v12, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v13, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v14, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v15, a3
; GFX908-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:48
; GFX908-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:32
; GFX908-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.form.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.form.ll
index 87a7c2ef6c95c..c21d86684e445 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.form.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.form.ll
@@ -8,10 +8,10 @@ define <4 x float> @default(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg
; HEURRC-LABEL: default:
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10
; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11
+; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10
+; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9
+; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8
; HEURRC-NEXT: s_nop 1
; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3]
; HEURRC-NEXT: s_nop 7
@@ -34,10 +34,10 @@ define <4 x float> @request_agpr(<8 x half> %arg0, <8 x half> %arg1, <4 x float>
; HEURRC-LABEL: request_agpr:
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10
; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11
+; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10
+; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9
+; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8
; HEURRC-NEXT: s_nop 1
; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3]
; HEURRC-NEXT: s_nop 7
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
index 5ab8706f28f5f..22bc62acce15d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
@@ -726,12 +726,12 @@ define amdgpu_kernel void @test_mfma_f64_4x4x4f64(ptr addrspace(1) %arg, double
; GFX90A-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[4:5], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-VGPR-NEXT: s_nop 1
-; GFX90A-VGPR-NEXT: v_mfma_f64_4x4x4f64 v[4:5], v[0:1], v[2:3], 0
+; GFX90A-VGPR-NEXT: v_mfma_f64_4x4x4f64 v[0:1], v[2:3], v[4:5], 0
; GFX90A-VGPR-NEXT: s_nop 3
-; GFX90A-VGPR-NEXT: v_mfma_f64_4x4x4f64 v[0:1], v[0:1], v[2:3], v[4:5] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: v_mfma_f64_4x4x4f64 v[0:1], v[2:3], v[4:5], v[0:1] cbsz:1 abid:2 blgp:3
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-VGPR-NEXT: s_nop 7
; GFX90A-VGPR-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
@@ -742,12 +742,12 @@ define amdgpu_kernel void @test_mfma_f64_4x4x4f64(ptr addrspace(1) %arg, double
; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[6:7]
; GFX942-VGPR-NEXT: s_nop 1
-; GFX942-VGPR-NEXT: v_mfma_f64_4x4x4_4b_f64 v[4:5], v[0:1], v[2:3], 0
+; GFX942-VGPR-NEXT: v_mfma_f64_4x4x4_4b_f64 v[0:1], v[2:3], v[4:5], 0
; GFX942-VGPR-NEXT: s_nop 3
-; GFX942-VGPR-NEXT: v_mfma_f64_4x4x4_4b_f64 v[0:1], v[0:1], v[2:3], v[4:5] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-VGPR-NEXT: v_mfma_f64_4x4x4_4b_f64 v[0:1], v[2:3], v[4:5], v[0:1] cbsz:1 abid:2 neg:[1,1,0]
; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, 0
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
@@ -765,10 +765,10 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl
; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
; GFX90A-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
-; GFX90A-NEXT: v_mov_b32_e32 v3, s11
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v1, s11
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[12:13], s[12:13] op_sel:[0,1]
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0
; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1
@@ -779,7 +779,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl
; GFX90A-NEXT: v_accvgpr_write_b32 a6, s6
; GFX90A-NEXT: v_accvgpr_write_b32 a7, s7
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[2:3], v[0:1], a[0:7] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_nop 15
; GFX90A-NEXT: s_nop 0
@@ -792,10 +792,10 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl
; GFX942-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
; GFX942-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
-; GFX942-NEXT: v_mov_b32_e32 v3, s11
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
+; GFX942-NEXT: v_mov_b32_e32 v1, s11
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[12:13]
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: v_accvgpr_write_b32 a0, s0
; GFX942-NEXT: v_accvgpr_write_b32 a1, s1
@@ -806,7 +806,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl
; GFX942-NEXT: v_accvgpr_write_b32 a6, s6
; GFX942-NEXT: v_accvgpr_write_b32 a7, s7
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[2:3], v[0:1], a[0:7] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0]
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_nop 15
; GFX942-NEXT: s_nop 0
@@ -819,17 +819,17 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl
; GFX90A-VGPR-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
; GFX90A-VGPR-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34
; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-VGPR-NEXT: v_mov_b32_e32 v10, s10
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, s10
; GFX90A-VGPR-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
-; GFX90A-VGPR-NEXT: v_mov_b32_e32 v11, s11
-; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v9, s11
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[12:13], s[12:13] op_sel:[0,1]
; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-VGPR-NEXT: s_nop 1
-; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[10:11], v[8:9], v[0:7] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-VGPR-NEXT: s_nop 15
; GFX90A-VGPR-NEXT: s_nop 0
@@ -842,17 +842,17 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl
; GFX942-VGPR-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
; GFX942-VGPR-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v10, s10
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, s10
; GFX942-VGPR-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v11, s11
-; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[12:13]
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, s11
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[12:13]
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
; GFX942-VGPR-NEXT: s_nop 1
-; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[10:11], v[8:9], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0
; GFX942-VGPR-NEXT: s_nop 15
; GFX942-VGPR-NEXT: s_nop 0
@@ -1629,20 +1629,20 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, 0x3ff00000
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, v0
; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-VGPR-NEXT: v_mov_b32_e32 v12, s2
-; GFX90A-VGPR-NEXT: v_mov_b32_e32 v13, s3
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v10, s2
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v11, s3
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, v0
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, v0
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, v0
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[12:13], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-VGPR-NEXT: s_nop 1
-; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[12:13], v[10:11], v[2:9]
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9]
; GFX90A-VGPR-NEXT: s_nop 15
; GFX90A-VGPR-NEXT: s_nop 1
; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
@@ -1657,20 +1657,20 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d
; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, 0x3ff00000
; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v12, s2
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v13, s3
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v10, s2
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v11, s3
; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0
; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, v0
; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v0
; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0
; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], v[6:7]
-; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[6:7]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[6:7]
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX942-VGPR-NEXT: s_nop 1
-; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[12:13], v[10:11], v[2:9]
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9]
; GFX942-VGPR-NEXT: s_nop 15
; GFX942-VGPR-NEXT: s_nop 1
; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
@@ -1743,20 +1743,20 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v1, 0x405ec000
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, v0
; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-VGPR-NEXT: v_mov_b32_e32 v12, s2
-; GFX90A-VGPR-NEXT: v_mov_b32_e32 v13, s3
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v10, s2
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v11, s3
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, v0
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, v1
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, v0
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, v1
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[12:13], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-VGPR-NEXT: s_nop 1
-; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[12:13], v[10:11], v[2:9]
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9]
; GFX90A-VGPR-NEXT: s_nop 15
; GFX90A-VGPR-NEXT: s_nop 1
; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
@@ -1771,20 +1771,20 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %
; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 0x405ec000
; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v12, s2
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v13, s3
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v10, s2
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v11, s3
; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v1
; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, v0
; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v1
; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0
; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v1
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], v[6:7]
-; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[6:7]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[6:7]
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX942-VGPR-NEXT: s_nop 1
-; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[12:13], v[10:11], v[2:9]
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9]
; GFX942-VGPR-NEXT: s_nop 15
; GFX942-VGPR-NEXT: s_nop 1
; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll
index dc4c929124fec..bc4822ef32a3d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll
@@ -1445,20 +1445,20 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_f16(ptr addrspace(1) %arg, <
; GFX942-SDAG: ; %bb.0: ; %bb
; GFX942-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX942-SDAG-NEXT: s_load_dword s6, s[4:5], 0x44
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX942-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
-; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
-; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v7, s6
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[12:13]
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[14:15]
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s6
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
-; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
; GFX942-SDAG-NEXT: s_nop 1
-; GFX942-SDAG-NEXT: v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2
+; GFX942-SDAG-NEXT: v_smfmac_f32_16x16x32_f16 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
; GFX942-SDAG-NEXT: s_nop 6
-; GFX942-SDAG-NEXT: global_store_dwordx4 v6, v[8:11], s[8:9]
+; GFX942-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[8:9]
; GFX942-SDAG-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: test_smfmac_f32_16x16x32_f16:
@@ -1467,38 +1467,38 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_f16(ptr addrspace(1) %arg, <
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX942-GISEL-NEXT: s_load_dword s6, s[4:5], 0x44
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[12:13]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[14:15]
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, s6
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX942-GISEL-NEXT: s_nop 1
-; GFX942-GISEL-NEXT: v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-GISEL-NEXT: v_smfmac_f32_16x16x32_f16 v[6:9], v[10:11], v[2:5], v0 cbsz:1 abid:2
; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX942-GISEL-NEXT: s_nop 5
-; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[8:9]
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[6:9], s[8:9]
; GFX942-GISEL-NEXT: s_endpgm
;
; GFX950-SDAG-LABEL: test_smfmac_f32_16x16x32_f16:
; GFX950-SDAG: ; %bb.0: ; %bb
; GFX950-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX950-SDAG-NEXT: s_load_dword s6, s[4:5], 0x44
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v6, 0
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
-; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
-; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v7, s6
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[12:13]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[14:15]
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, s6
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
-; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
; GFX950-SDAG-NEXT: s_nop 1
-; GFX950-SDAG-NEXT: v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2
+; GFX950-SDAG-NEXT: v_smfmac_f32_16x16x32_f16 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
; GFX950-SDAG-NEXT: s_nop 7
-; GFX950-SDAG-NEXT: global_store_dwordx4 v6, v[8:11], s[8:9]
+; GFX950-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[8:9]
; GFX950-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: test_smfmac_f32_16x16x32_f16:
@@ -1507,18 +1507,18 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_f16(ptr addrspace(1) %arg, <
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX950-GISEL-NEXT: s_load_dword s6, s[4:5], 0x44
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[12:13]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[14:15]
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, s6
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX950-GISEL-NEXT: s_nop 1
-; GFX950-GISEL-NEXT: v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-GISEL-NEXT: v_smfmac_f32_16x16x32_f16 v[6:9], v[10:11], v[2:5], v0 cbsz:1 abid:2
; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX950-GISEL-NEXT: s_nop 6
-; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[8:9]
+; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[6:9], s[8:9]
; GFX950-GISEL-NEXT: s_endpgm
; GFX942-AGPRCD-LABEL: test_smfmac_f32_16x16x32_f16:
; GFX942-AGPRCD: ; %bb.0: ; %bb
@@ -1577,11 +1577,11 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, <
; GFX942-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24
; GFX942-SDAG-NEXT: s_load_dword s24, s[4:5], 0x44
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: v_mov_b64_e32 v[20:21], s[18:19]
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
-; GFX942-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
-; GFX942-SDAG-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v22, s24
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[18:19], s[20:21]
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[20:21], s[22:23]
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, s24
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
@@ -1592,7 +1592,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, <
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-SDAG-NEXT: s_nop 1
-; GFX942-SDAG-NEXT: v_smfmac_f32_32x32x16_f16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX942-SDAG-NEXT: v_smfmac_f32_32x32x16_f16 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX942-SDAG-NEXT: s_nop 9
; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
@@ -1606,11 +1606,11 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, <
; GFX942-GISEL-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24
; GFX942-GISEL-NEXT: s_load_dword s24, s[4:5], 0x44
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[18:19]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v22, s24
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[20:21]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[22:23]
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, s24
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
@@ -1621,7 +1621,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, <
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-GISEL-NEXT: s_nop 1
-; GFX942-GISEL-NEXT: v_smfmac_f32_32x32x16_f16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX942-GISEL-NEXT: v_smfmac_f32_32x32x16_f16 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 0
; GFX942-GISEL-NEXT: s_nop 9
; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
@@ -1635,11 +1635,11 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, <
; GFX950-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24
; GFX950-SDAG-NEXT: s_load_dword s24, s[4:5], 0x44
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT: v_mov_b64_e32 v[20:21], s[18:19]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
; GFX950-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
-; GFX950-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
-; GFX950-SDAG-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, s24
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[18:19], s[20:21]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[20:21], s[22:23]
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s24
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
@@ -1650,7 +1650,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, <
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX950-SDAG-NEXT: s_nop 1
-; GFX950-SDAG-NEXT: v_smfmac_f32_32x32x16_f16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX950-SDAG-NEXT: v_smfmac_f32_32x32x16_f16 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX950-SDAG-NEXT: s_nop 10
; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
@@ -1664,11 +1664,11 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, <
; GFX950-GISEL-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24
; GFX950-GISEL-NEXT: s_load_dword s24, s[4:5], 0x44
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[18:19]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, s24
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[20:21]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[22:23]
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, s24
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
@@ -1679,7 +1679,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, <
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX950-GISEL-NEXT: s_nop 1
-; GFX950-GISEL-NEXT: v_smfmac_f32_32x32x16_f16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX950-GISEL-NEXT: v_smfmac_f32_32x32x16_f16 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 0
; GFX950-GISEL-NEXT: s_nop 10
; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
@@ -1847,20 +1847,20 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_bf16(ptr addrspace(1) %arg,
; GFX942-SDAG: ; %bb.0: ; %bb
; GFX942-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX942-SDAG-NEXT: s_load_dword s6, s[4:5], 0x44
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX942-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
-; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
-; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v7, s6
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[12:13]
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[14:15]
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s6
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
-; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
; GFX942-SDAG-NEXT: s_nop 1
-; GFX942-SDAG-NEXT: v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2
+; GFX942-SDAG-NEXT: v_smfmac_f32_16x16x32_bf16 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
; GFX942-SDAG-NEXT: s_nop 6
-; GFX942-SDAG-NEXT: global_store_dwordx4 v6, v[8:11], s[8:9]
+; GFX942-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[8:9]
; GFX942-SDAG-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: test_smfmac_f32_16x16x32_bf16:
@@ -1869,38 +1869,38 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_bf16(ptr addrspace(1) %arg,
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX942-GISEL-NEXT: s_load_dword s6, s[4:5], 0x44
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[12:13]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[14:15]
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, s6
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX942-GISEL-NEXT: s_nop 1
-; GFX942-GISEL-NEXT: v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-GISEL-NEXT: v_smfmac_f32_16x16x32_bf16 v[6:9], v[10:11], v[2:5], v0 cbsz:1 abid:2
; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX942-GISEL-NEXT: s_nop 5
-; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[8:9]
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[6:9], s[8:9]
; GFX942-GISEL-NEXT: s_endpgm
;
; GFX950-SDAG-LABEL: test_smfmac_f32_16x16x32_bf16:
; GFX950-SDAG: ; %bb.0: ; %bb
; GFX950-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX950-SDAG-NEXT: s_load_dword s6, s[4:5], 0x44
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v6, 0
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
-; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
-; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v7, s6
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[12:13]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[14:15]
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, s6
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
-; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
; GFX950-SDAG-NEXT: s_nop 1
-; GFX950-SDAG-NEXT: v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2
+; GFX950-SDAG-NEXT: v_smfmac_f32_16x16x32_bf16 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
; GFX950-SDAG-NEXT: s_nop 7
-; GFX950-SDAG-NEXT: global_store_dwordx4 v6, v[8:11], s[8:9]
+; GFX950-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[8:9]
; GFX950-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: test_smfmac_f32_16x16x32_bf16:
@@ -1909,18 +1909,18 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_bf16(ptr addrspace(1) %arg,
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX950-GISEL-NEXT: s_load_dword s6, s[4:5], 0x44
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[12:13]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[14:15]
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, s6
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX950-GISEL-NEXT: s_nop 1
-; GFX950-GISEL-NEXT: v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-GISEL-NEXT: v_smfmac_f32_16x16x32_bf16 v[6:9], v[10:11], v[2:5], v0 cbsz:1 abid:2
; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX950-GISEL-NEXT: s_nop 6
-; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[8:9]
+; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[6:9], s[8:9]
; GFX950-GISEL-NEXT: s_endpgm
; GFX942-AGPRCD-LABEL: test_smfmac_f32_16x16x32_bf16:
; GFX942-AGPRCD: ; %bb.0: ; %bb
@@ -1979,11 +1979,11 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg,
; GFX942-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24
; GFX942-SDAG-NEXT: s_load_dword s24, s[4:5], 0x44
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: v_mov_b64_e32 v[20:21], s[18:19]
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
-; GFX942-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
-; GFX942-SDAG-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v22, s24
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[18:19], s[20:21]
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[20:21], s[22:23]
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, s24
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
@@ -1994,7 +1994,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg,
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-SDAG-NEXT: s_nop 1
-; GFX942-SDAG-NEXT: v_smfmac_f32_32x32x16_bf16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX942-SDAG-NEXT: v_smfmac_f32_32x32x16_bf16 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX942-SDAG-NEXT: s_nop 9
; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
@@ -2008,11 +2008,11 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg,
; GFX942-GISEL-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24
; GFX942-GISEL-NEXT: s_load_dword s24, s[4:5], 0x44
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[18:19]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v22, s24
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[20:21]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[22:23]
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, s24
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
@@ -2023,7 +2023,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg,
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-GISEL-NEXT: s_nop 1
-; GFX942-GISEL-NEXT: v_smfmac_f32_32x32x16_bf16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX942-GISEL-NEXT: v_smfmac_f32_32x32x16_bf16 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 0
; GFX942-GISEL-NEXT: s_nop 9
; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
@@ -2037,11 +2037,11 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg,
; GFX950-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24
; GFX950-SDAG-NEXT: s_load_dword s24, s[4:5], 0x44
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT: v_mov_b64_e32 v[20:21], s[18:19]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
; GFX950-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
-; GFX950-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
-; GFX950-SDAG-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, s24
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[18:19], s[20:21]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[20:21], s[22:23]
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s24
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
@@ -2052,7 +2052,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg,
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX950-SDAG-NEXT: s_nop 1
-; GFX950-SDAG-NEXT: v_smfmac_f32_32x32x16_bf16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX950-SDAG-NEXT: v_smfmac_f32_32x32x16_bf16 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX950-SDAG-NEXT: s_nop 10
; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
@@ -2066,11 +2066,11 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg,
; GFX950-GISEL-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24
; GFX950-GISEL-NEXT: s_load_dword s24, s[4:5], 0x44
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[18:19]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, s24
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[20:21]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[22:23]
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, s24
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
@@ -2081,7 +2081,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg,
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX950-GISEL-NEXT: s_nop 1
-; GFX950-GISEL-NEXT: v_smfmac_f32_32x32x16_bf16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX950-GISEL-NEXT: v_smfmac_f32_32x32x16_bf16 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 0
; GFX950-GISEL-NEXT: s_nop 10
; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
@@ -2275,21 +2275,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_i8(ptr addrspace(1) %arg, <2
; GFX942-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX942-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[0:1]
; GFX942-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
; GFX942-GISEL-NEXT: s_mov_b32 s4, s2
; GFX942-GISEL-NEXT: s_mov_b32 s5, s3
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[6:7]
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, s14
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[8:9]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s14
; GFX942-GISEL-NEXT: s_nop 1
-; GFX942-GISEL-NEXT: v_smfmac_i32_16x16x64_i8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-GISEL-NEXT: v_smfmac_i32_16x16x64_i8 v[6:9], v[10:11], v[2:5], v0 cbsz:1 abid:2
; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX942-GISEL-NEXT: s_nop 5
-; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[6:9], s[12:13]
; GFX942-GISEL-NEXT: s_endpgm
;
; GFX950-SDAG-LABEL: test_smfmac_i32_16x16x64_i8:
@@ -2322,21 +2322,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_i8(ptr addrspace(1) %arg, <2
; GFX950-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX950-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[0:1]
; GFX950-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
; GFX950-GISEL-NEXT: s_mov_b32 s4, s2
; GFX950-GISEL-NEXT: s_mov_b32 s5, s3
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[6:7]
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, s14
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[8:9]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, s14
; GFX950-GISEL-NEXT: s_nop 1
-; GFX950-GISEL-NEXT: v_smfmac_i32_16x16x64_i8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-GISEL-NEXT: v_smfmac_i32_16x16x64_i8 v[6:9], v[10:11], v[2:5], v0 cbsz:1 abid:2
; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX950-GISEL-NEXT: s_nop 6
-; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
+; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[6:9], s[12:13]
; GFX950-GISEL-NEXT: s_endpgm
; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_i8:
; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
@@ -2495,15 +2495,15 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2
; GFX942-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c
; GFX942-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17]
; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
; GFX942-GISEL-NEXT: s_mov_b32 s20, s18
; GFX942-GISEL-NEXT: s_mov_b32 s21, s19
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v22, s26
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[20:21]
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, s26
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[22:23]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -2512,7 +2512,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-GISEL-NEXT: s_nop 1
-; GFX942-GISEL-NEXT: v_smfmac_i32_32x32x32_i8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX942-GISEL-NEXT: v_smfmac_i32_32x32x32_i8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 0
; GFX942-GISEL-NEXT: s_nop 9
; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25]
@@ -2560,15 +2560,15 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2
; GFX950-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c
; GFX950-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17]
; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
; GFX950-GISEL-NEXT: s_mov_b32 s20, s18
; GFX950-GISEL-NEXT: s_mov_b32 s21, s19
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, s26
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[20:21]
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, s26
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[22:23]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -2577,7 +2577,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX950-GISEL-NEXT: s_nop 1
-; GFX950-GISEL-NEXT: v_smfmac_i32_32x32x32_i8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX950-GISEL-NEXT: v_smfmac_i32_32x32x32_i8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 0
; GFX950-GISEL-NEXT: s_nop 10
; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25]
@@ -2789,21 +2789,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_bf8(ptr addrspace(1) %ar
; GFX942-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX942-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[0:1]
; GFX942-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
; GFX942-GISEL-NEXT: s_mov_b32 s4, s2
; GFX942-GISEL-NEXT: s_mov_b32 s5, s3
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[6:7]
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, s14
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[8:9]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s14
; GFX942-GISEL-NEXT: s_nop 1
-; GFX942-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[6:9], v[10:11], v[2:5], v0 cbsz:1 abid:2
; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX942-GISEL-NEXT: s_nop 5
-; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[6:9], s[12:13]
; GFX942-GISEL-NEXT: s_endpgm
;
; GFX950-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_bf8:
@@ -2836,21 +2836,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_bf8(ptr addrspace(1) %ar
; GFX950-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX950-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[0:1]
; GFX950-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
; GFX950-GISEL-NEXT: s_mov_b32 s4, s2
; GFX950-GISEL-NEXT: s_mov_b32 s5, s3
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[6:7]
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, s14
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[8:9]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, s14
; GFX950-GISEL-NEXT: s_nop 1
-; GFX950-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[6:9], v[10:11], v[2:5], v0 cbsz:1 abid:2
; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX950-GISEL-NEXT: s_nop 6
-; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
+; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[6:9], s[12:13]
; GFX950-GISEL-NEXT: s_endpgm
; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_bf8:
; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
@@ -3000,21 +3000,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_fp8(ptr addrspace(1) %ar
; GFX942-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX942-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[0:1]
; GFX942-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
; GFX942-GISEL-NEXT: s_mov_b32 s4, s2
; GFX942-GISEL-NEXT: s_mov_b32 s5, s3
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[6:7]
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, s14
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[8:9]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s14
; GFX942-GISEL-NEXT: s_nop 1
-; GFX942-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[6:9], v[10:11], v[2:5], v0 cbsz:1 abid:2
; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX942-GISEL-NEXT: s_nop 5
-; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[6:9], s[12:13]
; GFX942-GISEL-NEXT: s_endpgm
;
; GFX950-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_fp8:
@@ -3047,21 +3047,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_fp8(ptr addrspace(1) %ar
; GFX950-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX950-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[0:1]
; GFX950-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
; GFX950-GISEL-NEXT: s_mov_b32 s4, s2
; GFX950-GISEL-NEXT: s_mov_b32 s5, s3
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[6:7]
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, s14
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[8:9]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, s14
; GFX950-GISEL-NEXT: s_nop 1
-; GFX950-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[6:9], v[10:11], v[2:5], v0 cbsz:1 abid:2
; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX950-GISEL-NEXT: s_nop 6
-; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
+; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[6:9], s[12:13]
; GFX950-GISEL-NEXT: s_endpgm
; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_fp8:
; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
@@ -3211,21 +3211,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_bf8(ptr addrspace(1) %ar
; GFX942-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX942-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[0:1]
; GFX942-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
; GFX942-GISEL-NEXT: s_mov_b32 s4, s2
; GFX942-GISEL-NEXT: s_mov_b32 s5, s3
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[6:7]
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, s14
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[8:9]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s14
; GFX942-GISEL-NEXT: s_nop 1
-; GFX942-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[6:9], v[10:11], v[2:5], v0 cbsz:1 abid:2
; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX942-GISEL-NEXT: s_nop 5
-; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[6:9], s[12:13]
; GFX942-GISEL-NEXT: s_endpgm
;
; GFX950-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_bf8:
@@ -3258,21 +3258,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_bf8(ptr addrspace(1) %ar
; GFX950-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX950-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[0:1]
; GFX950-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
; GFX950-GISEL-NEXT: s_mov_b32 s4, s2
; GFX950-GISEL-NEXT: s_mov_b32 s5, s3
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[6:7]
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, s14
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[8:9]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, s14
; GFX950-GISEL-NEXT: s_nop 1
-; GFX950-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[6:9], v[10:11], v[2:5], v0 cbsz:1 abid:2
; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX950-GISEL-NEXT: s_nop 6
-; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
+; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[6:9], s[12:13]
; GFX950-GISEL-NEXT: s_endpgm
; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_bf8:
; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
@@ -3422,21 +3422,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_fp8(ptr addrspace(1) %ar
; GFX942-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX942-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[0:1]
; GFX942-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
; GFX942-GISEL-NEXT: s_mov_b32 s4, s2
; GFX942-GISEL-NEXT: s_mov_b32 s5, s3
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[6:7]
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, s14
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[8:9]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s14
; GFX942-GISEL-NEXT: s_nop 1
-; GFX942-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[6:9], v[10:11], v[2:5], v0 cbsz:1 abid:2
; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX942-GISEL-NEXT: s_nop 5
-; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[6:9], s[12:13]
; GFX942-GISEL-NEXT: s_endpgm
;
; GFX950-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_fp8:
@@ -3469,21 +3469,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_fp8(ptr addrspace(1) %ar
; GFX950-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX950-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[0:1]
; GFX950-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
; GFX950-GISEL-NEXT: s_mov_b32 s4, s2
; GFX950-GISEL-NEXT: s_mov_b32 s5, s3
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[6:7]
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, s14
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[8:9]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, s14
; GFX950-GISEL-NEXT: s_nop 1
-; GFX950-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[6:9], v[10:11], v[2:5], v0 cbsz:1 abid:2
; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX950-GISEL-NEXT: s_nop 6
-; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
+; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[6:9], s[12:13]
; GFX950-GISEL-NEXT: s_endpgm
; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_fp8:
; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
@@ -3642,15 +3642,15 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar
; GFX942-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c
; GFX942-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17]
; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
; GFX942-GISEL-NEXT: s_mov_b32 s20, s18
; GFX942-GISEL-NEXT: s_mov_b32 s21, s19
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v22, s26
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[20:21]
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, s26
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[22:23]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -3659,7 +3659,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-GISEL-NEXT: s_nop 1
-; GFX942-GISEL-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX942-GISEL-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 0
; GFX942-GISEL-NEXT: s_nop 9
; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25]
@@ -3707,15 +3707,15 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar
; GFX950-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c
; GFX950-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17]
; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
; GFX950-GISEL-NEXT: s_mov_b32 s20, s18
; GFX950-GISEL-NEXT: s_mov_b32 s21, s19
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, s26
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[20:21]
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, s26
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[22:23]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -3724,7 +3724,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX950-GISEL-NEXT: s_nop 1
-; GFX950-GISEL-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX950-GISEL-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 0
; GFX950-GISEL-NEXT: s_nop 10
; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25]
@@ -3945,15 +3945,15 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar
; GFX942-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c
; GFX942-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17]
; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
; GFX942-GISEL-NEXT: s_mov_b32 s20, s18
; GFX942-GISEL-NEXT: s_mov_b32 s21, s19
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v22, s26
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[20:21]
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, s26
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[22:23]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -3962,7 +3962,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-GISEL-NEXT: s_nop 1
-; GFX942-GISEL-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX942-GISEL-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 0
; GFX942-GISEL-NEXT: s_nop 9
; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25]
@@ -4010,15 +4010,15 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar
; GFX950-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c
; GFX950-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17]
; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
; GFX950-GISEL-NEXT: s_mov_b32 s20, s18
; GFX950-GISEL-NEXT: s_mov_b32 s21, s19
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, s26
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[20:21]
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, s26
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[22:23]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -4027,7 +4027,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX950-GISEL-NEXT: s_nop 1
-; GFX950-GISEL-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX950-GISEL-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 0
; GFX950-GISEL-NEXT: s_nop 10
; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25]
@@ -4248,15 +4248,15 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar
; GFX942-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c
; GFX942-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17]
; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
; GFX942-GISEL-NEXT: s_mov_b32 s20, s18
; GFX942-GISEL-NEXT: s_mov_b32 s21, s19
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v22, s26
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[20:21]
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, s26
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[22:23]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -4265,7 +4265,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-GISEL-NEXT: s_nop 1
-; GFX942-GISEL-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX942-GISEL-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 0
; GFX942-GISEL-NEXT: s_nop 9
; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25]
@@ -4313,15 +4313,15 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar
; GFX950-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c
; GFX950-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17]
; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
; GFX950-GISEL-NEXT: s_mov_b32 s20, s18
; GFX950-GISEL-NEXT: s_mov_b32 s21, s19
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, s26
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[20:21]
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, s26
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[22:23]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -4330,7 +4330,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX950-GISEL-NEXT: s_nop 1
-; GFX950-GISEL-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX950-GISEL-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 0
; GFX950-GISEL-NEXT: s_nop 10
; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25]
@@ -4551,15 +4551,15 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar
; GFX942-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c
; GFX942-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17]
; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
; GFX942-GISEL-NEXT: s_mov_b32 s20, s18
; GFX942-GISEL-NEXT: s_mov_b32 s21, s19
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v22, s26
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[20:21]
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, s26
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[22:23]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -4568,7 +4568,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-GISEL-NEXT: s_nop 1
-; GFX942-GISEL-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX942-GISEL-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 0
; GFX942-GISEL-NEXT: s_nop 9
; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25]
@@ -4616,15 +4616,15 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar
; GFX950-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c
; GFX950-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17]
; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
; GFX950-GISEL-NEXT: s_mov_b32 s20, s18
; GFX950-GISEL-NEXT: s_mov_b32 s21, s19
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, s26
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[20:21]
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, s26
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[22:23]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -4633,7 +4633,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX950-GISEL-NEXT: s_nop 1
-; GFX950-GISEL-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX950-GISEL-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 0
; GFX950-GISEL-NEXT: s_nop 10
; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
index 033a35f69a0bd..68e3afe8b449a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
@@ -15,15 +15,15 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16(<8 x bfloat> %arg0, <8 x
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; GCN-NEXT: v_mov_b64_e32 v[8:9], 48
-; GCN-NEXT: v_mov_b64_e32 v[10:11], 32
-; GCN-NEXT: v_mov_b64_e32 v[12:13], 16
+; GCN-NEXT: v_mov_b64_e32 v[0:1], 48
+; GCN-NEXT: v_mov_b64_e32 v[2:3], 32
+; GCN-NEXT: v_mov_b64_e32 v[4:5], 16
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; GCN-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
+; GCN-NEXT: v_mov_b64_e32 v[8:9], s[24:25]
+; GCN-NEXT: v_mov_b64_e32 v[10:11], s[26:27]
+; GCN-NEXT: v_mov_b64_e32 v[12:13], s[28:29]
; GCN-NEXT: v_accvgpr_write_b32 a0, s8
-; GCN-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
+; GCN-NEXT: v_mov_b64_e32 v[14:15], s[30:31]
; GCN-NEXT: v_accvgpr_write_b32 a1, s9
; GCN-NEXT: v_accvgpr_write_b32 a2, s10
; GCN-NEXT: v_accvgpr_write_b32 a3, s11
@@ -41,40 +41,39 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16(<8 x bfloat> %arg0, <8 x
; GCN-NEXT: v_accvgpr_write_b32 a15, s23
; GCN-NEXT: v_mov_b32_e32 v16, s16
; GCN-NEXT: v_mov_b32_e32 v17, s17
-; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[16:31], v[0:3], v[4:7], a[0:15]
+; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[16:31], v[8:11], v[12:15], a[0:15]
; GCN-NEXT: v_mov_b32_e32 v18, s18
; GCN-NEXT: v_mov_b32_e32 v19, s19
-; GCN-NEXT: v_mov_b32_e32 v0, s20
-; GCN-NEXT: v_mov_b32_e32 v1, s21
-; GCN-NEXT: v_mov_b32_e32 v2, s22
-; GCN-NEXT: v_mov_b32_e32 v3, s23
-; GCN-NEXT: v_mov_b64_e32 v[14:15], 0
+; GCN-NEXT: v_mov_b32_e32 v8, s20
+; GCN-NEXT: v_mov_b32_e32 v9, s21
+; GCN-NEXT: v_mov_b32_e32 v10, s22
+; GCN-NEXT: v_mov_b32_e32 v11, s23
+; GCN-NEXT: v_mov_b64_e32 v[6:7], 0
; GCN-NEXT: s_nop 4
-; GCN-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[0:1], a[28:31], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[2:3], a[24:27], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[4:5], a[20:23], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[6:7], a[16:19], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[2:3], v[16:19], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[0:1], v[8:11], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v0, s8
; GCN-NEXT: v_mov_b32_e32 v1, s9
; GCN-NEXT: v_mov_b32_e32 v2, s10
; GCN-NEXT: v_mov_b32_e32 v3, s11
-; GCN-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v0, s12
; GCN-NEXT: v_mov_b32_e32 v1, s13
; GCN-NEXT: v_mov_b32_e32 v2, s14
; GCN-NEXT: v_mov_b32_e32 v3, s15
-; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0)
@@ -88,15 +87,15 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__flags(<8 x bfloat> %arg0
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; GCN-NEXT: v_mov_b64_e32 v[8:9], 48
-; GCN-NEXT: v_mov_b64_e32 v[10:11], 32
-; GCN-NEXT: v_mov_b64_e32 v[12:13], 16
+; GCN-NEXT: v_mov_b64_e32 v[0:1], 48
+; GCN-NEXT: v_mov_b64_e32 v[2:3], 32
+; GCN-NEXT: v_mov_b64_e32 v[4:5], 16
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; GCN-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
+; GCN-NEXT: v_mov_b64_e32 v[8:9], s[24:25]
+; GCN-NEXT: v_mov_b64_e32 v[10:11], s[26:27]
+; GCN-NEXT: v_mov_b64_e32 v[12:13], s[28:29]
; GCN-NEXT: v_accvgpr_write_b32 a0, s8
-; GCN-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
+; GCN-NEXT: v_mov_b64_e32 v[14:15], s[30:31]
; GCN-NEXT: v_accvgpr_write_b32 a1, s9
; GCN-NEXT: v_accvgpr_write_b32 a2, s10
; GCN-NEXT: v_accvgpr_write_b32 a3, s11
@@ -114,40 +113,39 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__flags(<8 x bfloat> %arg0
; GCN-NEXT: v_accvgpr_write_b32 a15, s23
; GCN-NEXT: v_mov_b32_e32 v16, s16
; GCN-NEXT: v_mov_b32_e32 v17, s17
-; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1
+; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[16:31], v[8:11], v[12:15], a[0:15] cbsz:2 abid:3 blgp:1
; GCN-NEXT: v_mov_b32_e32 v18, s18
; GCN-NEXT: v_mov_b32_e32 v19, s19
-; GCN-NEXT: v_mov_b32_e32 v0, s20
-; GCN-NEXT: v_mov_b32_e32 v1, s21
-; GCN-NEXT: v_mov_b32_e32 v2, s22
-; GCN-NEXT: v_mov_b32_e32 v3, s23
-; GCN-NEXT: v_mov_b64_e32 v[14:15], 0
+; GCN-NEXT: v_mov_b32_e32 v8, s20
+; GCN-NEXT: v_mov_b32_e32 v9, s21
+; GCN-NEXT: v_mov_b32_e32 v10, s22
+; GCN-NEXT: v_mov_b32_e32 v11, s23
+; GCN-NEXT: v_mov_b64_e32 v[6:7], 0
; GCN-NEXT: s_nop 4
-; GCN-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[0:1], a[28:31], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[2:3], a[24:27], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[4:5], a[20:23], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[6:7], a[16:19], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[2:3], v[16:19], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[0:1], v[8:11], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v0, s8
; GCN-NEXT: v_mov_b32_e32 v1, s9
; GCN-NEXT: v_mov_b32_e32 v2, s10
; GCN-NEXT: v_mov_b32_e32 v3, s11
-; GCN-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v0, s12
; GCN-NEXT: v_mov_b32_e32 v1, s13
; GCN-NEXT: v_mov_b32_e32 v2, s14
; GCN-NEXT: v_mov_b32_e32 v3, s15
-; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 2, i32 3, i32 1)
@@ -160,22 +158,22 @@ define <16 x float> @test_mfma_f32_32x32x16_bf16__mac(<8 x bfloat> %arg0, <8 x b
; GCN-LABEL: test_mfma_f32_32x32x16_bf16__mac:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v8
-; GCN-NEXT: v_accvgpr_write_b32 a1, v9
-; GCN-NEXT: v_accvgpr_write_b32 a2, v10
-; GCN-NEXT: v_accvgpr_write_b32 a3, v11
-; GCN-NEXT: v_accvgpr_write_b32 a4, v12
-; GCN-NEXT: v_accvgpr_write_b32 a5, v13
-; GCN-NEXT: v_accvgpr_write_b32 a6, v14
-; GCN-NEXT: v_accvgpr_write_b32 a7, v15
-; GCN-NEXT: v_accvgpr_write_b32 a8, v16
-; GCN-NEXT: v_accvgpr_write_b32 a9, v17
-; GCN-NEXT: v_accvgpr_write_b32 a10, v18
-; GCN-NEXT: v_accvgpr_write_b32 a11, v19
-; GCN-NEXT: v_accvgpr_write_b32 a12, v20
-; GCN-NEXT: v_accvgpr_write_b32 a13, v21
-; GCN-NEXT: v_accvgpr_write_b32 a14, v22
; GCN-NEXT: v_accvgpr_write_b32 a15, v23
+; GCN-NEXT: v_accvgpr_write_b32 a14, v22
+; GCN-NEXT: v_accvgpr_write_b32 a13, v21
+; GCN-NEXT: v_accvgpr_write_b32 a12, v20
+; GCN-NEXT: v_accvgpr_write_b32 a11, v19
+; GCN-NEXT: v_accvgpr_write_b32 a10, v18
+; GCN-NEXT: v_accvgpr_write_b32 a9, v17
+; GCN-NEXT: v_accvgpr_write_b32 a8, v16
+; GCN-NEXT: v_accvgpr_write_b32 a7, v15
+; GCN-NEXT: v_accvgpr_write_b32 a6, v14
+; GCN-NEXT: v_accvgpr_write_b32 a5, v13
+; GCN-NEXT: v_accvgpr_write_b32 a4, v12
+; GCN-NEXT: v_accvgpr_write_b32 a3, v11
+; GCN-NEXT: v_accvgpr_write_b32 a2, v10
+; GCN-NEXT: v_accvgpr_write_b32 a1, v9
+; GCN-NEXT: v_accvgpr_write_b32 a0, v8
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15]
; GCN-NEXT: s_nop 11
@@ -204,22 +202,22 @@ define <16 x float> @test_mfma_f32_32x32x16_bf16__mac__flags(<8 x bfloat> %arg0,
; GCN-LABEL: test_mfma_f32_32x32x16_bf16__mac__flags:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v8
-; GCN-NEXT: v_accvgpr_write_b32 a1, v9
-; GCN-NEXT: v_accvgpr_write_b32 a2, v10
-; GCN-NEXT: v_accvgpr_write_b32 a3, v11
-; GCN-NEXT: v_accvgpr_write_b32 a4, v12
-; GCN-NEXT: v_accvgpr_write_b32 a5, v13
-; GCN-NEXT: v_accvgpr_write_b32 a6, v14
-; GCN-NEXT: v_accvgpr_write_b32 a7, v15
-; GCN-NEXT: v_accvgpr_write_b32 a8, v16
-; GCN-NEXT: v_accvgpr_write_b32 a9, v17
-; GCN-NEXT: v_accvgpr_write_b32 a10, v18
-; GCN-NEXT: v_accvgpr_write_b32 a11, v19
-; GCN-NEXT: v_accvgpr_write_b32 a12, v20
-; GCN-NEXT: v_accvgpr_write_b32 a13, v21
-; GCN-NEXT: v_accvgpr_write_b32 a14, v22
; GCN-NEXT: v_accvgpr_write_b32 a15, v23
+; GCN-NEXT: v_accvgpr_write_b32 a14, v22
+; GCN-NEXT: v_accvgpr_write_b32 a13, v21
+; GCN-NEXT: v_accvgpr_write_b32 a12, v20
+; GCN-NEXT: v_accvgpr_write_b32 a11, v19
+; GCN-NEXT: v_accvgpr_write_b32 a10, v18
+; GCN-NEXT: v_accvgpr_write_b32 a9, v17
+; GCN-NEXT: v_accvgpr_write_b32 a8, v16
+; GCN-NEXT: v_accvgpr_write_b32 a7, v15
+; GCN-NEXT: v_accvgpr_write_b32 a6, v14
+; GCN-NEXT: v_accvgpr_write_b32 a5, v13
+; GCN-NEXT: v_accvgpr_write_b32 a4, v12
+; GCN-NEXT: v_accvgpr_write_b32 a3, v11
+; GCN-NEXT: v_accvgpr_write_b32 a2, v10
+; GCN-NEXT: v_accvgpr_write_b32 a1, v9
+; GCN-NEXT: v_accvgpr_write_b32 a0, v8
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1
; GCN-NEXT: s_nop 11
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
index 753206206180a..03bf33e0d17e6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
@@ -12,29 +12,45 @@ declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half>, <8 x half>,
; --------------------------------------------------------------------
define <4 x float> @test_mfma_f32_16x16x32_f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2) {
-; GCN-LABEL: test_mfma_f32_16x16x32_f16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v8
-; GCN-NEXT: v_accvgpr_write_b32 a1, v9
-; GCN-NEXT: v_accvgpr_write_b32 a2, v10
-; GCN-NEXT: v_accvgpr_write_b32 a3, v11
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3]
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_f32_16x16x32_f16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3]
+; SDAG-NEXT: s_nop 7
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_f32_16x16x32_f16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v8
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v9
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v10
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v11
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3]
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; HEURRC-LABEL: test_mfma_f32_16x16x32_f16:
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10
; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11
+; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10
+; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9
+; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8
; HEURRC-NEXT: s_nop 1
; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3]
; HEURRC-NEXT: s_nop 7
@@ -74,29 +90,45 @@ define <4 x float> @test_mfma_f32_16x16x32_f16(<8 x half> %arg0, <8 x half> %arg
}
define <4 x float> @test_mfma_f32_16x16x32_f16__flags(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2) {
-; GCN-LABEL: test_mfma_f32_16x16x32_f16__flags:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v8
-; GCN-NEXT: v_accvgpr_write_b32 a1, v9
-; GCN-NEXT: v_accvgpr_write_b32 a2, v10
-; GCN-NEXT: v_accvgpr_write_b32 a3, v11
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_f32_16x16x32_f16__flags:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
+; SDAG-NEXT: s_nop 7
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_f32_16x16x32_f16__flags:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v8
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v9
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v10
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v11
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; HEURRC-LABEL: test_mfma_f32_16x16x32_f16__flags:
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10
; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11
+; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10
+; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9
+; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8
; HEURRC-NEXT: s_nop 1
; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
; HEURRC-NEXT: s_nop 7
@@ -382,15 +414,15 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; SDAG-NEXT: v_mov_b64_e32 v[8:9], 48
-; SDAG-NEXT: v_mov_b64_e32 v[10:11], 32
-; SDAG-NEXT: v_mov_b64_e32 v[12:13], 16
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], 32
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[24:25]
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[26:27]
+; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[28:29]
; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
-; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
+; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[30:31]
; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
@@ -408,40 +440,39 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
; SDAG-NEXT: v_mov_b32_e32 v16, s16
; SDAG-NEXT: v_mov_b32_e32 v17, s17
-; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15]
+; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[8:11], v[12:15], a[0:15]
; SDAG-NEXT: v_mov_b32_e32 v18, s18
; SDAG-NEXT: v_mov_b32_e32 v19, s19
-; SDAG-NEXT: v_mov_b32_e32 v0, s20
-; SDAG-NEXT: v_mov_b32_e32 v1, s21
-; SDAG-NEXT: v_mov_b32_e32 v2, s22
-; SDAG-NEXT: v_mov_b32_e32 v3, s23
-; SDAG-NEXT: v_mov_b64_e32 v[14:15], 0
+; SDAG-NEXT: v_mov_b32_e32 v8, s20
+; SDAG-NEXT: v_mov_b32_e32 v9, s21
+; SDAG-NEXT: v_mov_b32_e32 v10, s22
+; SDAG-NEXT: v_mov_b32_e32 v11, s23
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], 0
; SDAG-NEXT: s_nop 4
-; SDAG-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[0:1], a[28:31], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[2:3], a[24:27], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[4:5], a[20:23], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[6:7], a[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[2:3], v[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[0:1], v[8:11], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mov_b32_e32 v0, s8
; SDAG-NEXT: v_mov_b32_e32 v1, s9
; SDAG-NEXT: v_mov_b32_e32 v2, s10
; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mov_b32_e32 v0, s12
; SDAG-NEXT: v_mov_b32_e32 v1, s13
; SDAG-NEXT: v_mov_b32_e32 v2, s14
; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -449,15 +480,15 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
; GISEL: ; %bb.0:
; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], 0
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], 16
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], 32
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], 0
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], 16
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], 32
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[26:27]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[28:29]
; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[30:31]
; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
@@ -473,34 +504,33 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], 48
-; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15]
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], 48
+; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[8:11], v[12:15], a[0:15]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
; GISEL-NEXT: s_nop 8
-; GISEL-NEXT: global_store_dwordx4 v[12:13], a[16:19], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[0:1], a[16:19], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[14:15], a[20:23], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[2:3], a[20:23], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[16:17], a[24:27], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[4:5], a[24:27], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[18:19], a[28:31], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[6:7], a[28:31], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[12:13], v[8:11], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[0:1], v[16:19], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[2:3], v[8:11], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_nop 0
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19]
-; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[22:23]
-; GISEL-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
;
@@ -508,15 +538,15 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; HEURRC-NEXT: v_mov_b64_e32 v[8:9], 48
-; HEURRC-NEXT: v_mov_b64_e32 v[10:11], 32
-; HEURRC-NEXT: v_mov_b64_e32 v[12:13], 16
+; HEURRC-NEXT: v_mov_b64_e32 v[0:1], 48
+; HEURRC-NEXT: v_mov_b64_e32 v[2:3], 32
+; HEURRC-NEXT: v_mov_b64_e32 v[4:5], 16
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
+; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[24:25]
+; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[26:27]
+; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[28:29]
; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8
-; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
+; HEURRC-NEXT: v_mov_b64_e32 v[14:15], s[30:31]
; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9
; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10
; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11
@@ -534,40 +564,39 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23
; HEURRC-NEXT: v_mov_b32_e32 v16, s16
; HEURRC-NEXT: v_mov_b32_e32 v17, s17
-; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15]
+; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[8:11], v[12:15], a[0:15]
; HEURRC-NEXT: v_mov_b32_e32 v18, s18
; HEURRC-NEXT: v_mov_b32_e32 v19, s19
-; HEURRC-NEXT: v_mov_b32_e32 v0, s20
-; HEURRC-NEXT: v_mov_b32_e32 v1, s21
-; HEURRC-NEXT: v_mov_b32_e32 v2, s22
-; HEURRC-NEXT: v_mov_b32_e32 v3, s23
-; HEURRC-NEXT: v_mov_b64_e32 v[14:15], 0
+; HEURRC-NEXT: v_mov_b32_e32 v8, s20
+; HEURRC-NEXT: v_mov_b32_e32 v9, s21
+; HEURRC-NEXT: v_mov_b32_e32 v10, s22
+; HEURRC-NEXT: v_mov_b32_e32 v11, s23
+; HEURRC-NEXT: v_mov_b64_e32 v[6:7], 0
; HEURRC-NEXT: s_nop 4
-; HEURRC-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[0:1], a[28:31], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[2:3], a[24:27], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[4:5], a[20:23], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[6:7], a[16:19], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[2:3], v[16:19], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[0:1], v[8:11], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: s_nop 0
; HEURRC-NEXT: v_mov_b32_e32 v0, s8
; HEURRC-NEXT: v_mov_b32_e32 v1, s9
; HEURRC-NEXT: v_mov_b32_e32 v2, s10
; HEURRC-NEXT: v_mov_b32_e32 v3, s11
-; HEURRC-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
; HEURRC-NEXT: v_mov_b32_e32 v0, s12
; HEURRC-NEXT: v_mov_b32_e32 v1, s13
; HEURRC-NEXT: v_mov_b32_e32 v2, s14
; HEURRC-NEXT: v_mov_b32_e32 v3, s15
-; HEURRC-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_endpgm
;
@@ -575,15 +604,15 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
; VGPRRC: ; %bb.0:
; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; VGPRRC-NEXT: v_mov_b64_e32 v[40:41], 48
-; VGPRRC-NEXT: v_mov_b64_e32 v[42:43], 32
-; VGPRRC-NEXT: v_mov_b64_e32 v[44:45], 16
+; VGPRRC-NEXT: v_mov_b64_e32 v[32:33], 48
+; VGPRRC-NEXT: v_mov_b64_e32 v[34:35], 32
+; VGPRRC-NEXT: v_mov_b64_e32 v[36:37], 16
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
-; VGPRRC-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
-; VGPRRC-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
-; VGPRRC-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; VGPRRC-NEXT: v_mov_b64_e32 v[42:43], s[26:27]
+; VGPRRC-NEXT: v_mov_b64_e32 v[40:41], s[24:25]
+; VGPRRC-NEXT: v_mov_b64_e32 v[46:47], s[30:31]
; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
-; VGPRRC-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; VGPRRC-NEXT: v_mov_b64_e32 v[44:45], s[28:29]
; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
@@ -593,40 +622,40 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
; VGPRRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; VGPRRC-NEXT: v_mov_b32_e32 v48, s16
; VGPRRC-NEXT: v_mov_b32_e32 v49, s17
-; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[32:35], v[36:39], v[0:15]
+; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[40:43], v[44:47], v[0:15]
; VGPRRC-NEXT: v_mov_b32_e32 v50, s18
; VGPRRC-NEXT: v_mov_b32_e32 v51, s19
-; VGPRRC-NEXT: v_mov_b64_e32 v[46:47], 0
+; VGPRRC-NEXT: v_mov_b64_e32 v[38:39], 0
; VGPRRC-NEXT: s_nop 8
-; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[28:31], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[32:33], v[28:31], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[24:27], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[34:35], v[24:27], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[20:23], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[36:37], v[20:23], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[16:19], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[38:39], v[16:19], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: v_mov_b32_e32 v0, s20
; VGPRRC-NEXT: v_mov_b32_e32 v1, s21
; VGPRRC-NEXT: v_mov_b32_e32 v2, s22
; VGPRRC-NEXT: v_mov_b32_e32 v3, s23
-; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[48:51], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[34:35], v[48:51], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v0, s8
; VGPRRC-NEXT: v_mov_b32_e32 v1, s9
; VGPRRC-NEXT: v_mov_b32_e32 v2, s10
; VGPRRC-NEXT: v_mov_b32_e32 v3, s11
-; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[38:39], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v0, s12
; VGPRRC-NEXT: v_mov_b32_e32 v1, s13
; VGPRRC-NEXT: v_mov_b32_e32 v2, s14
; VGPRRC-NEXT: v_mov_b32_e32 v3, s15
-; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[36:37], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_f32_32x32x16_f16:
@@ -765,15 +794,15 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; SDAG-NEXT: v_mov_b64_e32 v[8:9], 48
-; SDAG-NEXT: v_mov_b64_e32 v[10:11], 32
-; SDAG-NEXT: v_mov_b64_e32 v[12:13], 16
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], 32
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[24:25]
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[26:27]
+; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[28:29]
; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
-; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
+; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[30:31]
; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
@@ -791,40 +820,39 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
; SDAG-NEXT: v_mov_b32_e32 v16, s16
; SDAG-NEXT: v_mov_b32_e32 v17, s17
-; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1
+; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[8:11], v[12:15], a[0:15] cbsz:2 abid:3 blgp:1
; SDAG-NEXT: v_mov_b32_e32 v18, s18
; SDAG-NEXT: v_mov_b32_e32 v19, s19
-; SDAG-NEXT: v_mov_b32_e32 v0, s20
-; SDAG-NEXT: v_mov_b32_e32 v1, s21
-; SDAG-NEXT: v_mov_b32_e32 v2, s22
-; SDAG-NEXT: v_mov_b32_e32 v3, s23
-; SDAG-NEXT: v_mov_b64_e32 v[14:15], 0
+; SDAG-NEXT: v_mov_b32_e32 v8, s20
+; SDAG-NEXT: v_mov_b32_e32 v9, s21
+; SDAG-NEXT: v_mov_b32_e32 v10, s22
+; SDAG-NEXT: v_mov_b32_e32 v11, s23
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], 0
; SDAG-NEXT: s_nop 4
-; SDAG-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[0:1], a[28:31], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[2:3], a[24:27], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[4:5], a[20:23], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[6:7], a[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[2:3], v[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[0:1], v[8:11], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mov_b32_e32 v0, s8
; SDAG-NEXT: v_mov_b32_e32 v1, s9
; SDAG-NEXT: v_mov_b32_e32 v2, s10
; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mov_b32_e32 v0, s12
; SDAG-NEXT: v_mov_b32_e32 v1, s13
; SDAG-NEXT: v_mov_b32_e32 v2, s14
; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -832,15 +860,15 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
; GISEL: ; %bb.0:
; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], 0
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], 16
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], 32
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], 0
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], 16
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], 32
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[26:27]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[28:29]
; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[30:31]
; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
@@ -856,34 +884,33 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], 48
-; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], 48
+; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[8:11], v[12:15], a[0:15] cbsz:2 abid:3 blgp:1
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
; GISEL-NEXT: s_nop 8
-; GISEL-NEXT: global_store_dwordx4 v[12:13], a[16:19], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[0:1], a[16:19], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[14:15], a[20:23], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[2:3], a[20:23], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[16:17], a[24:27], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[4:5], a[24:27], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[18:19], a[28:31], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[6:7], a[28:31], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[12:13], v[8:11], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[0:1], v[16:19], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[2:3], v[8:11], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_nop 0
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19]
-; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[22:23]
-; GISEL-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
;
@@ -891,15 +918,15 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; HEURRC-NEXT: v_mov_b64_e32 v[8:9], 48
-; HEURRC-NEXT: v_mov_b64_e32 v[10:11], 32
-; HEURRC-NEXT: v_mov_b64_e32 v[12:13], 16
+; HEURRC-NEXT: v_mov_b64_e32 v[0:1], 48
+; HEURRC-NEXT: v_mov_b64_e32 v[2:3], 32
+; HEURRC-NEXT: v_mov_b64_e32 v[4:5], 16
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
+; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[24:25]
+; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[26:27]
+; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[28:29]
; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8
-; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
+; HEURRC-NEXT: v_mov_b64_e32 v[14:15], s[30:31]
; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9
; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10
; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11
@@ -917,40 +944,39 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23
; HEURRC-NEXT: v_mov_b32_e32 v16, s16
; HEURRC-NEXT: v_mov_b32_e32 v17, s17
-; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1
+; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[8:11], v[12:15], a[0:15] cbsz:2 abid:3 blgp:1
; HEURRC-NEXT: v_mov_b32_e32 v18, s18
; HEURRC-NEXT: v_mov_b32_e32 v19, s19
-; HEURRC-NEXT: v_mov_b32_e32 v0, s20
-; HEURRC-NEXT: v_mov_b32_e32 v1, s21
-; HEURRC-NEXT: v_mov_b32_e32 v2, s22
-; HEURRC-NEXT: v_mov_b32_e32 v3, s23
-; HEURRC-NEXT: v_mov_b64_e32 v[14:15], 0
+; HEURRC-NEXT: v_mov_b32_e32 v8, s20
+; HEURRC-NEXT: v_mov_b32_e32 v9, s21
+; HEURRC-NEXT: v_mov_b32_e32 v10, s22
+; HEURRC-NEXT: v_mov_b32_e32 v11, s23
+; HEURRC-NEXT: v_mov_b64_e32 v[6:7], 0
; HEURRC-NEXT: s_nop 4
-; HEURRC-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[0:1], a[28:31], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[2:3], a[24:27], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[4:5], a[20:23], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[6:7], a[16:19], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[2:3], v[16:19], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[0:1], v[8:11], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: s_nop 0
; HEURRC-NEXT: v_mov_b32_e32 v0, s8
; HEURRC-NEXT: v_mov_b32_e32 v1, s9
; HEURRC-NEXT: v_mov_b32_e32 v2, s10
; HEURRC-NEXT: v_mov_b32_e32 v3, s11
-; HEURRC-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
; HEURRC-NEXT: v_mov_b32_e32 v0, s12
; HEURRC-NEXT: v_mov_b32_e32 v1, s13
; HEURRC-NEXT: v_mov_b32_e32 v2, s14
; HEURRC-NEXT: v_mov_b32_e32 v3, s15
-; HEURRC-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_endpgm
;
@@ -958,15 +984,15 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
; VGPRRC: ; %bb.0:
; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; VGPRRC-NEXT: v_mov_b64_e32 v[40:41], 48
-; VGPRRC-NEXT: v_mov_b64_e32 v[42:43], 32
-; VGPRRC-NEXT: v_mov_b64_e32 v[44:45], 16
+; VGPRRC-NEXT: v_mov_b64_e32 v[32:33], 48
+; VGPRRC-NEXT: v_mov_b64_e32 v[34:35], 32
+; VGPRRC-NEXT: v_mov_b64_e32 v[36:37], 16
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
-; VGPRRC-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
-; VGPRRC-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
-; VGPRRC-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; VGPRRC-NEXT: v_mov_b64_e32 v[42:43], s[26:27]
+; VGPRRC-NEXT: v_mov_b64_e32 v[40:41], s[24:25]
+; VGPRRC-NEXT: v_mov_b64_e32 v[46:47], s[30:31]
; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
-; VGPRRC-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; VGPRRC-NEXT: v_mov_b64_e32 v[44:45], s[28:29]
; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
@@ -976,40 +1002,40 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
; VGPRRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; VGPRRC-NEXT: v_mov_b32_e32 v48, s16
; VGPRRC-NEXT: v_mov_b32_e32 v49, s17
-; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[32:35], v[36:39], v[0:15] cbsz:2 abid:3 blgp:1
+; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[40:43], v[44:47], v[0:15] cbsz:2 abid:3 blgp:1
; VGPRRC-NEXT: v_mov_b32_e32 v50, s18
; VGPRRC-NEXT: v_mov_b32_e32 v51, s19
-; VGPRRC-NEXT: v_mov_b64_e32 v[46:47], 0
+; VGPRRC-NEXT: v_mov_b64_e32 v[38:39], 0
; VGPRRC-NEXT: s_nop 8
-; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[28:31], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[32:33], v[28:31], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[24:27], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[34:35], v[24:27], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[20:23], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[36:37], v[20:23], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[16:19], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[38:39], v[16:19], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: v_mov_b32_e32 v0, s20
; VGPRRC-NEXT: v_mov_b32_e32 v1, s21
; VGPRRC-NEXT: v_mov_b32_e32 v2, s22
; VGPRRC-NEXT: v_mov_b32_e32 v3, s23
-; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[48:51], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[34:35], v[48:51], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v0, s8
; VGPRRC-NEXT: v_mov_b32_e32 v1, s9
; VGPRRC-NEXT: v_mov_b32_e32 v2, s10
; VGPRRC-NEXT: v_mov_b32_e32 v3, s11
-; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[38:39], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v0, s12
; VGPRRC-NEXT: v_mov_b32_e32 v1, s13
; VGPRRC-NEXT: v_mov_b32_e32 v2, s14
; VGPRRC-NEXT: v_mov_b32_e32 v3, s15
-; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[36:37], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_f32_32x32x16_f16__flags:
@@ -1144,65 +1170,105 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
}
define <16 x float> @test_mfma_f32_32x32x16_f16__mac(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2) {
-; GCN-LABEL: test_mfma_f32_32x32x16_f16__mac:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v8
-; GCN-NEXT: v_accvgpr_write_b32 a1, v9
-; GCN-NEXT: v_accvgpr_write_b32 a2, v10
-; GCN-NEXT: v_accvgpr_write_b32 a3, v11
-; GCN-NEXT: v_accvgpr_write_b32 a4, v12
-; GCN-NEXT: v_accvgpr_write_b32 a5, v13
-; GCN-NEXT: v_accvgpr_write_b32 a6, v14
-; GCN-NEXT: v_accvgpr_write_b32 a7, v15
-; GCN-NEXT: v_accvgpr_write_b32 a8, v16
-; GCN-NEXT: v_accvgpr_write_b32 a9, v17
-; GCN-NEXT: v_accvgpr_write_b32 a10, v18
-; GCN-NEXT: v_accvgpr_write_b32 a11, v19
-; GCN-NEXT: v_accvgpr_write_b32 a12, v20
-; GCN-NEXT: v_accvgpr_write_b32 a13, v21
-; GCN-NEXT: v_accvgpr_write_b32 a14, v22
-; GCN-NEXT: v_accvgpr_write_b32 a15, v23
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15]
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_f32_32x32x16_f16__mac:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15]
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_f32_32x32x16_f16__mac:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v8
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v9
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v10
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v11
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v12
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v13
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a15, v23
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15]
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; HEURRC-LABEL: test_mfma_f32_32x32x16_f16__mac:
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10
-; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11
-; HEURRC-NEXT: v_accvgpr_write_b32 a4, v12
-; HEURRC-NEXT: v_accvgpr_write_b32 a5, v13
-; HEURRC-NEXT: v_accvgpr_write_b32 a6, v14
-; HEURRC-NEXT: v_accvgpr_write_b32 a7, v15
-; HEURRC-NEXT: v_accvgpr_write_b32 a8, v16
-; HEURRC-NEXT: v_accvgpr_write_b32 a9, v17
-; HEURRC-NEXT: v_accvgpr_write_b32 a10, v18
-; HEURRC-NEXT: v_accvgpr_write_b32 a11, v19
-; HEURRC-NEXT: v_accvgpr_write_b32 a12, v20
-; HEURRC-NEXT: v_accvgpr_write_b32 a13, v21
-; HEURRC-NEXT: v_accvgpr_write_b32 a14, v22
; HEURRC-NEXT: v_accvgpr_write_b32 a15, v23
+; HEURRC-NEXT: v_accvgpr_write_b32 a14, v22
+; HEURRC-NEXT: v_accvgpr_write_b32 a13, v21
+; HEURRC-NEXT: v_accvgpr_write_b32 a12, v20
+; HEURRC-NEXT: v_accvgpr_write_b32 a11, v19
+; HEURRC-NEXT: v_accvgpr_write_b32 a10, v18
+; HEURRC-NEXT: v_accvgpr_write_b32 a9, v17
+; HEURRC-NEXT: v_accvgpr_write_b32 a8, v16
+; HEURRC-NEXT: v_accvgpr_write_b32 a7, v15
+; HEURRC-NEXT: v_accvgpr_write_b32 a6, v14
+; HEURRC-NEXT: v_accvgpr_write_b32 a5, v13
+; HEURRC-NEXT: v_accvgpr_write_b32 a4, v12
+; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11
+; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10
+; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9
+; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8
; HEURRC-NEXT: s_nop 1
; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15]
; HEURRC-NEXT: s_nop 11
@@ -1314,65 +1380,105 @@ define <16 x float> @test_mfma_f32_32x32x16_f16__mac(<8 x half> %arg0, <8 x half
}
define <16 x float> @test_mfma_f32_32x32x16_f16__mac__flags(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2) {
-; GCN-LABEL: test_mfma_f32_32x32x16_f16__mac__flags:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v8
-; GCN-NEXT: v_accvgpr_write_b32 a1, v9
-; GCN-NEXT: v_accvgpr_write_b32 a2, v10
-; GCN-NEXT: v_accvgpr_write_b32 a3, v11
-; GCN-NEXT: v_accvgpr_write_b32 a4, v12
-; GCN-NEXT: v_accvgpr_write_b32 a5, v13
-; GCN-NEXT: v_accvgpr_write_b32 a6, v14
-; GCN-NEXT: v_accvgpr_write_b32 a7, v15
-; GCN-NEXT: v_accvgpr_write_b32 a8, v16
-; GCN-NEXT: v_accvgpr_write_b32 a9, v17
-; GCN-NEXT: v_accvgpr_write_b32 a10, v18
-; GCN-NEXT: v_accvgpr_write_b32 a11, v19
-; GCN-NEXT: v_accvgpr_write_b32 a12, v20
-; GCN-NEXT: v_accvgpr_write_b32 a13, v21
-; GCN-NEXT: v_accvgpr_write_b32 a14, v22
-; GCN-NEXT: v_accvgpr_write_b32 a15, v23
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_f32_32x32x16_f16__mac__flags:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_f32_32x32x16_f16__mac__flags:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v8
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v9
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v10
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v11
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v12
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v13
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a15, v23
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; HEURRC-LABEL: test_mfma_f32_32x32x16_f16__mac__flags:
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10
-; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11
-; HEURRC-NEXT: v_accvgpr_write_b32 a4, v12
-; HEURRC-NEXT: v_accvgpr_write_b32 a5, v13
-; HEURRC-NEXT: v_accvgpr_write_b32 a6, v14
-; HEURRC-NEXT: v_accvgpr_write_b32 a7, v15
-; HEURRC-NEXT: v_accvgpr_write_b32 a8, v16
-; HEURRC-NEXT: v_accvgpr_write_b32 a9, v17
-; HEURRC-NEXT: v_accvgpr_write_b32 a10, v18
-; HEURRC-NEXT: v_accvgpr_write_b32 a11, v19
-; HEURRC-NEXT: v_accvgpr_write_b32 a12, v20
-; HEURRC-NEXT: v_accvgpr_write_b32 a13, v21
-; HEURRC-NEXT: v_accvgpr_write_b32 a14, v22
; HEURRC-NEXT: v_accvgpr_write_b32 a15, v23
+; HEURRC-NEXT: v_accvgpr_write_b32 a14, v22
+; HEURRC-NEXT: v_accvgpr_write_b32 a13, v21
+; HEURRC-NEXT: v_accvgpr_write_b32 a12, v20
+; HEURRC-NEXT: v_accvgpr_write_b32 a11, v19
+; HEURRC-NEXT: v_accvgpr_write_b32 a10, v18
+; HEURRC-NEXT: v_accvgpr_write_b32 a9, v17
+; HEURRC-NEXT: v_accvgpr_write_b32 a8, v16
+; HEURRC-NEXT: v_accvgpr_write_b32 a7, v15
+; HEURRC-NEXT: v_accvgpr_write_b32 a6, v14
+; HEURRC-NEXT: v_accvgpr_write_b32 a5, v13
+; HEURRC-NEXT: v_accvgpr_write_b32 a4, v12
+; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11
+; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10
+; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9
+; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8
; HEURRC-NEXT: s_nop 1
; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1
; HEURRC-NEXT: s_nop 11
@@ -2536,29 +2642,45 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal
declare <4 x i32> @llvm.amdgcn.mfma.i32.16x16x64.i8(<4 x i32>, <4 x i32>, <4 x i32>, i32 immarg, i32 immarg, i32 immarg)
define <4 x i32> @test_mfma_i32_16x16x64_i8(<4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2) {
-; GCN-LABEL: test_mfma_i32_16x16x64_i8:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v8
-; GCN-NEXT: v_accvgpr_write_b32 a1, v9
-; GCN-NEXT: v_accvgpr_write_b32 a2, v10
-; GCN-NEXT: v_accvgpr_write_b32 a3, v11
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3]
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_i32_16x16x64_i8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3]
+; SDAG-NEXT: s_nop 7
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_i32_16x16x64_i8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v8
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v9
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v10
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v11
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3]
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; HEURRC-LABEL: test_mfma_i32_16x16x64_i8:
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10
; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11
+; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10
+; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9
+; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8
; HEURRC-NEXT: s_nop 1
; HEURRC-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3]
; HEURRC-NEXT: s_nop 7
@@ -2598,29 +2720,45 @@ define <4 x i32> @test_mfma_i32_16x16x64_i8(<4 x i32> %arg0, <4 x i32> %arg1, <4
}
define <4 x i32> @test_mfma_i32_16x16x64_i8__flags(<4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2) {
-; GCN-LABEL: test_mfma_i32_16x16x64_i8__flags:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v8
-; GCN-NEXT: v_accvgpr_write_b32 a1, v9
-; GCN-NEXT: v_accvgpr_write_b32 a2, v10
-; GCN-NEXT: v_accvgpr_write_b32 a3, v11
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_i32_16x16x64_i8__flags:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
+; SDAG-NEXT: s_nop 7
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_i32_16x16x64_i8__flags:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v8
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v9
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v10
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v11
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; HEURRC-LABEL: test_mfma_i32_16x16x64_i8__flags:
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10
; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11
+; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10
+; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9
+; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8
; HEURRC-NEXT: s_nop 1
; HEURRC-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
; HEURRC-NEXT: s_nop 7
@@ -3035,15 +3173,15 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
; GISEL: ; %bb.0:
; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], 0
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], 16
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], 32
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], 0
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], 16
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], 32
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[26:27]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[28:29]
; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[30:31]
; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
@@ -3059,34 +3197,33 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], 48
-; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15]
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], 48
+; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[8:11], v[12:15], a[0:15]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
; GISEL-NEXT: s_nop 8
-; GISEL-NEXT: global_store_dwordx4 v[12:13], a[16:19], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[0:1], a[16:19], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[14:15], a[20:23], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[2:3], a[20:23], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[16:17], a[24:27], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[4:5], a[24:27], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[18:19], a[28:31], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[6:7], a[28:31], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[12:13], v[8:11], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[0:1], v[16:19], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[2:3], v[8:11], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_nop 0
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19]
-; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[22:23]
-; GISEL-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
;
@@ -3447,15 +3584,15 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
; GISEL: ; %bb.0:
; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], 0
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], 16
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], 32
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], 0
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], 16
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], 32
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[26:27]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[28:29]
; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[30:31]
; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
@@ -3471,34 +3608,33 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], 48
-; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], 48
+; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[8:11], v[12:15], a[0:15] cbsz:2 abid:3 blgp:1
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
; GISEL-NEXT: s_nop 8
-; GISEL-NEXT: global_store_dwordx4 v[12:13], a[16:19], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[0:1], a[16:19], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[14:15], a[20:23], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[2:3], a[20:23], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[16:17], a[24:27], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[4:5], a[24:27], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[18:19], a[28:31], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[6:7], a[28:31], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[12:13], v[8:11], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[0:1], v[16:19], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[2:3], v[8:11], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_nop 0
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19]
-; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[22:23]
-; GISEL-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
;
@@ -3784,65 +3920,105 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
}
define <16 x i32> @test_mfma_i32_32x32x32_i8__mac(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2) {
-; GCN-LABEL: test_mfma_i32_32x32x32_i8__mac:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v8
-; GCN-NEXT: v_accvgpr_write_b32 a1, v9
-; GCN-NEXT: v_accvgpr_write_b32 a2, v10
-; GCN-NEXT: v_accvgpr_write_b32 a3, v11
-; GCN-NEXT: v_accvgpr_write_b32 a4, v12
-; GCN-NEXT: v_accvgpr_write_b32 a5, v13
-; GCN-NEXT: v_accvgpr_write_b32 a6, v14
-; GCN-NEXT: v_accvgpr_write_b32 a7, v15
-; GCN-NEXT: v_accvgpr_write_b32 a8, v16
-; GCN-NEXT: v_accvgpr_write_b32 a9, v17
-; GCN-NEXT: v_accvgpr_write_b32 a10, v18
-; GCN-NEXT: v_accvgpr_write_b32 a11, v19
-; GCN-NEXT: v_accvgpr_write_b32 a12, v20
-; GCN-NEXT: v_accvgpr_write_b32 a13, v21
-; GCN-NEXT: v_accvgpr_write_b32 a14, v22
-; GCN-NEXT: v_accvgpr_write_b32 a15, v23
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15]
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_i32_32x32x32_i8__mac:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15]
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_i32_32x32x32_i8__mac:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v8
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v9
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v10
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v11
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v12
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v13
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a15, v23
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15]
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; HEURRC-LABEL: test_mfma_i32_32x32x32_i8__mac:
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10
-; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11
-; HEURRC-NEXT: v_accvgpr_write_b32 a4, v12
-; HEURRC-NEXT: v_accvgpr_write_b32 a5, v13
-; HEURRC-NEXT: v_accvgpr_write_b32 a6, v14
-; HEURRC-NEXT: v_accvgpr_write_b32 a7, v15
-; HEURRC-NEXT: v_accvgpr_write_b32 a8, v16
-; HEURRC-NEXT: v_accvgpr_write_b32 a9, v17
-; HEURRC-NEXT: v_accvgpr_write_b32 a10, v18
-; HEURRC-NEXT: v_accvgpr_write_b32 a11, v19
-; HEURRC-NEXT: v_accvgpr_write_b32 a12, v20
-; HEURRC-NEXT: v_accvgpr_write_b32 a13, v21
-; HEURRC-NEXT: v_accvgpr_write_b32 a14, v22
; HEURRC-NEXT: v_accvgpr_write_b32 a15, v23
+; HEURRC-NEXT: v_accvgpr_write_b32 a14, v22
+; HEURRC-NEXT: v_accvgpr_write_b32 a13, v21
+; HEURRC-NEXT: v_accvgpr_write_b32 a12, v20
+; HEURRC-NEXT: v_accvgpr_write_b32 a11, v19
+; HEURRC-NEXT: v_accvgpr_write_b32 a10, v18
+; HEURRC-NEXT: v_accvgpr_write_b32 a9, v17
+; HEURRC-NEXT: v_accvgpr_write_b32 a8, v16
+; HEURRC-NEXT: v_accvgpr_write_b32 a7, v15
+; HEURRC-NEXT: v_accvgpr_write_b32 a6, v14
+; HEURRC-NEXT: v_accvgpr_write_b32 a5, v13
+; HEURRC-NEXT: v_accvgpr_write_b32 a4, v12
+; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11
+; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10
+; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9
+; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8
; HEURRC-NEXT: s_nop 1
; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15]
; HEURRC-NEXT: s_nop 11
@@ -3954,65 +4130,105 @@ define <16 x i32> @test_mfma_i32_32x32x32_i8__mac(<4 x i32> %arg0, <4 x i32> %ar
}
define <16 x i32> @test_mfma_i32_32x32x32_i8__mac__flags(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2) {
-; GCN-LABEL: test_mfma_i32_32x32x32_i8__mac__flags:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v8
-; GCN-NEXT: v_accvgpr_write_b32 a1, v9
-; GCN-NEXT: v_accvgpr_write_b32 a2, v10
-; GCN-NEXT: v_accvgpr_write_b32 a3, v11
-; GCN-NEXT: v_accvgpr_write_b32 a4, v12
-; GCN-NEXT: v_accvgpr_write_b32 a5, v13
-; GCN-NEXT: v_accvgpr_write_b32 a6, v14
-; GCN-NEXT: v_accvgpr_write_b32 a7, v15
-; GCN-NEXT: v_accvgpr_write_b32 a8, v16
-; GCN-NEXT: v_accvgpr_write_b32 a9, v17
-; GCN-NEXT: v_accvgpr_write_b32 a10, v18
-; GCN-NEXT: v_accvgpr_write_b32 a11, v19
-; GCN-NEXT: v_accvgpr_write_b32 a12, v20
-; GCN-NEXT: v_accvgpr_write_b32 a13, v21
-; GCN-NEXT: v_accvgpr_write_b32 a14, v22
-; GCN-NEXT: v_accvgpr_write_b32 a15, v23
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_i32_32x32x32_i8__mac__flags:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_i32_32x32x32_i8__mac__flags:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v8
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v9
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v10
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v11
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v12
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v13
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a15, v23
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; HEURRC-LABEL: test_mfma_i32_32x32x32_i8__mac__flags:
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10
-; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11
-; HEURRC-NEXT: v_accvgpr_write_b32 a4, v12
-; HEURRC-NEXT: v_accvgpr_write_b32 a5, v13
-; HEURRC-NEXT: v_accvgpr_write_b32 a6, v14
-; HEURRC-NEXT: v_accvgpr_write_b32 a7, v15
-; HEURRC-NEXT: v_accvgpr_write_b32 a8, v16
-; HEURRC-NEXT: v_accvgpr_write_b32 a9, v17
-; HEURRC-NEXT: v_accvgpr_write_b32 a10, v18
-; HEURRC-NEXT: v_accvgpr_write_b32 a11, v19
-; HEURRC-NEXT: v_accvgpr_write_b32 a12, v20
-; HEURRC-NEXT: v_accvgpr_write_b32 a13, v21
-; HEURRC-NEXT: v_accvgpr_write_b32 a14, v22
; HEURRC-NEXT: v_accvgpr_write_b32 a15, v23
+; HEURRC-NEXT: v_accvgpr_write_b32 a14, v22
+; HEURRC-NEXT: v_accvgpr_write_b32 a13, v21
+; HEURRC-NEXT: v_accvgpr_write_b32 a12, v20
+; HEURRC-NEXT: v_accvgpr_write_b32 a11, v19
+; HEURRC-NEXT: v_accvgpr_write_b32 a10, v18
+; HEURRC-NEXT: v_accvgpr_write_b32 a9, v17
+; HEURRC-NEXT: v_accvgpr_write_b32 a8, v16
+; HEURRC-NEXT: v_accvgpr_write_b32 a7, v15
+; HEURRC-NEXT: v_accvgpr_write_b32 a6, v14
+; HEURRC-NEXT: v_accvgpr_write_b32 a5, v13
+; HEURRC-NEXT: v_accvgpr_write_b32 a4, v12
+; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11
+; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10
+; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9
+; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8
; HEURRC-NEXT: s_nop 1
; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1
; HEURRC-NEXT: s_nop 11
@@ -5299,10 +5515,10 @@ define <4 x float> @test_mfma_f32_16x16x32_bf16(<8 x bfloat> %arg0, <8 x bfloat>
; GCN-LABEL: test_mfma_f32_16x16x32_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v8
-; GCN-NEXT: v_accvgpr_write_b32 a1, v9
-; GCN-NEXT: v_accvgpr_write_b32 a2, v10
; GCN-NEXT: v_accvgpr_write_b32 a3, v11
+; GCN-NEXT: v_accvgpr_write_b32 a2, v10
+; GCN-NEXT: v_accvgpr_write_b32 a1, v9
+; GCN-NEXT: v_accvgpr_write_b32 a0, v8
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3]
; GCN-NEXT: s_nop 7
@@ -5315,10 +5531,10 @@ define <4 x float> @test_mfma_f32_16x16x32_bf16(<8 x bfloat> %arg0, <8 x bfloat>
; HEURRC-LABEL: test_mfma_f32_16x16x32_bf16:
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10
; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11
+; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10
+; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9
+; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8
; HEURRC-NEXT: s_nop 1
; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3]
; HEURRC-NEXT: s_nop 7
@@ -5361,10 +5577,10 @@ define <4 x float> @test_mfma_f32_16x16x32_bf16__flags(<8 x bfloat> %arg0, <8 x
; GCN-LABEL: test_mfma_f32_16x16x32_bf16__flags:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v8
-; GCN-NEXT: v_accvgpr_write_b32 a1, v9
-; GCN-NEXT: v_accvgpr_write_b32 a2, v10
; GCN-NEXT: v_accvgpr_write_b32 a3, v11
+; GCN-NEXT: v_accvgpr_write_b32 a2, v10
+; GCN-NEXT: v_accvgpr_write_b32 a1, v9
+; GCN-NEXT: v_accvgpr_write_b32 a0, v8
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
; GCN-NEXT: s_nop 7
@@ -5377,10 +5593,10 @@ define <4 x float> @test_mfma_f32_16x16x32_bf16__flags(<8 x bfloat> %arg0, <8 x
; HEURRC-LABEL: test_mfma_f32_16x16x32_bf16__flags:
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10
; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11
+; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10
+; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9
+; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8
; HEURRC-NEXT: s_nop 1
; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
; HEURRC-NEXT: s_nop 7
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.i8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.i8.ll
index d24f1f0b526c3..c1946630ef5f1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.i8.ll
@@ -52,27 +52,26 @@ define amdgpu_kernel void @test_mfma_i32_32x32x8i8(ptr addrspace(1) %arg) #0 {
; GFX908-NEXT: v_mfma_i32_32x32x8i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
; GFX908-NEXT: s_nop 15
; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: v_accvgpr_read_b32 v15, a15
-; GFX908-NEXT: v_accvgpr_read_b32 v14, a14
-; GFX908-NEXT: v_accvgpr_read_b32 v13, a13
-; GFX908-NEXT: v_accvgpr_read_b32 v12, a12
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
-; GFX908-NEXT: v_accvgpr_read_b32 v7, a7
-; GFX908-NEXT: v_accvgpr_read_b32 v6, a6
-; GFX908-NEXT: v_accvgpr_read_b32 v5, a5
-; GFX908-NEXT: v_accvgpr_read_b32 v4, a4
-; GFX908-NEXT: v_accvgpr_read_b32 v11, a11
-; GFX908-NEXT: v_accvgpr_read_b32 v10, a10
-; GFX908-NEXT: v_accvgpr_read_b32 v9, a9
-; GFX908-NEXT: v_accvgpr_read_b32 v8, a8
-; GFX908-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
-; GFX908-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
-; GFX908-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a12
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a15
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a8
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v7, a11
+; GFX908-NEXT: v_accvgpr_read_b32 v8, a4
+; GFX908-NEXT: v_accvgpr_read_b32 v9, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v10, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v11, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v12, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v13, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v14, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v15, a3
+; GFX908-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:48
+; GFX908-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:32
+; GFX908-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16
+; GFX908-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17]
; GFX908-NEXT: s_endpgm
;
; GFX90A-LABEL: test_mfma_i32_32x32x8i8:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
index 7e30af96bb8b9..3d9ebf91e8f47 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
@@ -99,59 +99,59 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 {
; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31] cbsz:1 abid:2 blgp:3
; NOLIT-SRCC-NEXT: s_nop 15
; NOLIT-SRCC-NEXT: s_nop 1
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:112
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35]
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16
; NOLIT-SRCC-NEXT: s_endpgm
@@ -234,59 +234,59 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 {
; LIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31] cbsz:1 abid:2 blgp:3
; LIT-SRCC-NEXT: s_nop 15
; LIT-SRCC-NEXT: s_nop 1
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96
; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:112
; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64
; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80
; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32
; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48
; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35]
; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16
; LIT-SRCC-NEXT: s_endpgm
@@ -510,25 +510,25 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
; NOLIT-SRCC-NEXT: s_nop 9
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7
; NOLIT-SRCC-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:48
; NOLIT-SRCC-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:32
; NOLIT-SRCC-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17]
; NOLIT-SRCC-NEXT: s_endpgm
@@ -577,25 +577,25 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
; LIT-SRCC-NEXT: s_nop 9
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7
; LIT-SRCC-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:48
; LIT-SRCC-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:32
; LIT-SRCC-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17]
; LIT-SRCC-NEXT: s_endpgm
@@ -864,22 +864,22 @@ define amdgpu_kernel void @test_mfma_f32_32x32x2f32(ptr addrspace(1) %arg) #0 {
; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x2f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
; NOLIT-SRCC-NEXT: s_nop 15
; NOLIT-SRCC-NEXT: s_nop 1
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a3
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a2
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a1
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a0
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a1
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a2
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a3
; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:48
; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:32
; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16
@@ -931,22 +931,22 @@ define amdgpu_kernel void @test_mfma_f32_32x32x2f32(ptr addrspace(1) %arg) #0 {
; LIT-SRCC-NEXT: v_mfma_f32_32x32x2f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
; LIT-SRCC-NEXT: s_nop 15
; LIT-SRCC-NEXT: s_nop 1
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a3
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a2
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a1
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a0
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a1
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a2
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a3
; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:48
; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:32
; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16
@@ -1257,59 +1257,59 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a
; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x4f16 a[0:31], v[2:3], v[0:1], a[0:31] cbsz:1 abid:2 blgp:3
; NOLIT-SRCC-NEXT: s_nop 15
; NOLIT-SRCC-NEXT: s_nop 1
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:96
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:112
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:64
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:80
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:32
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:48
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37]
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:16
; NOLIT-SRCC-NEXT: s_endpgm
@@ -1396,59 +1396,59 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a
; LIT-SRCC-NEXT: v_mfma_f32_32x32x4f16 a[0:31], v[2:3], v[0:1], a[0:31] cbsz:1 abid:2 blgp:3
; LIT-SRCC-NEXT: s_nop 15
; LIT-SRCC-NEXT: s_nop 1
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:96
; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:112
; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16
-; LIT-SRCC-NEXT: s_nop 1
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19
+; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:64
; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:80
; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:32
; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:48
; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37]
; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:16
; LIT-SRCC-NEXT: s_endpgm
@@ -1690,25 +1690,25 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f16(ptr addrspace(1) %arg, ptr a
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x4f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
; NOLIT-SRCC-NEXT: s_nop 9
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7
; NOLIT-SRCC-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:48
; NOLIT-SRCC-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:32
; NOLIT-SRCC-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17]
; NOLIT-SRCC-NEXT: s_endpgm
@@ -1760,25 +1760,25 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f16(ptr addrspace(1) %arg, ptr a
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: v_mfma_f32_16x16x4f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
; LIT-SRCC-NEXT: s_nop 9
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7
; LIT-SRCC-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:48
; LIT-SRCC-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:32
; LIT-SRCC-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17]
; LIT-SRCC-NEXT: s_endpgm
@@ -2080,22 +2080,22 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16(ptr addrspace(1) %arg, ptr a
; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
; NOLIT-SRCC-NEXT: s_nop 15
; NOLIT-SRCC-NEXT: s_nop 1
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a3
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a2
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a1
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a0
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a1
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a2
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a3
; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:48
; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:32
; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16
@@ -2150,22 +2150,22 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16(ptr addrspace(1) %arg, ptr a
; LIT-SRCC-NEXT: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
; LIT-SRCC-NEXT: s_nop 15
; LIT-SRCC-NEXT: s_nop 1
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a3
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a2
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a1
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a0
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a1
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a2
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a3
; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:48
; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:32
; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16
@@ -2425,7 +2425,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 {
; NOLIT-SRCC-LABEL: test_mfma_i32_32x32x4i8:
; NOLIT-SRCC: ; %bb.0: ; %bb
; NOLIT-SRCC-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
-; NOLIT-SRCC-NEXT: v_mov_b32_e32 v16, 0
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0
; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
; NOLIT-SRCC-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0
; NOLIT-SRCC-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40
@@ -2482,7 +2482,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 {
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s10
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s11
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s12
-; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, s20
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v5, s20
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v3
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a26, v0
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a27, v1
@@ -2491,7 +2491,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 {
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s14
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s15
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, 1
-; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v4
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v5
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a29, v0
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a30, v1
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a31, v2
@@ -2500,53 +2500,67 @@ define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 {
; NOLIT-SRCC-NEXT: v_mfma_i32_32x32x4i8 a[0:31], v3, v0, a[0:31] cbsz:1 abid:2 blgp:3
; NOLIT-SRCC-NEXT: s_nop 15
; NOLIT-SRCC-NEXT: s_nop 1
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a27
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a26
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a25
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a24
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
-; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[34:35] offset:96
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a31
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a30
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a29
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a28
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27
+; NOLIT-SRCC-NEXT: s_nop 1
+; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96
+; NOLIT-SRCC-NEXT: s_nop 0
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31
+; NOLIT-SRCC-NEXT: s_nop 1
+; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:112
+; NOLIT-SRCC-NEXT: s_nop 0
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19
+; NOLIT-SRCC-NEXT: s_nop 1
+; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64
+; NOLIT-SRCC-NEXT: s_nop 0
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23
+; NOLIT-SRCC-NEXT: s_nop 1
+; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80
+; NOLIT-SRCC-NEXT: s_nop 0
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11
+; NOLIT-SRCC-NEXT: s_nop 1
+; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32
+; NOLIT-SRCC-NEXT: s_nop 0
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
+; NOLIT-SRCC-NEXT: s_nop 1
+; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48
+; NOLIT-SRCC-NEXT: s_nop 0
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
+; NOLIT-SRCC-NEXT: s_nop 1
+; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35]
+; NOLIT-SRCC-NEXT: s_nop 0
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a3
-; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[34:35] offset:112
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a2
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a19
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a18
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a17
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a16
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a1
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a0
-; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[34:35] offset:64
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a15
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a23
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a22
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a21
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a20
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a14
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a13
-; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[34:35] offset:80
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a12
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a11
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a10
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a9
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a8
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7
; NOLIT-SRCC-NEXT: s_nop 1
-; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[34:35] offset:32
-; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[34:35] offset:48
-; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[34:35]
-; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[34:35] offset:16
+; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16
; NOLIT-SRCC-NEXT: s_endpgm
;
; LIT-SRCC-LABEL: test_mfma_i32_32x32x4i8:
; LIT-SRCC: ; %bb.0: ; %bb
; LIT-SRCC-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
-; LIT-SRCC-NEXT: v_mov_b32_e32 v16, 0
+; LIT-SRCC-NEXT: v_mov_b32_e32 v4, 0
; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
; LIT-SRCC-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0
; LIT-SRCC-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40
@@ -2603,7 +2617,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 {
; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s10
; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s11
; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s12
-; LIT-SRCC-NEXT: v_mov_b32_e32 v4, s20
+; LIT-SRCC-NEXT: v_mov_b32_e32 v5, s20
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v3
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a26, v0
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a27, v1
@@ -2612,7 +2626,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 {
; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s14
; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s15
; LIT-SRCC-NEXT: v_mov_b32_e32 v3, 1
-; LIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v4
+; LIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v5
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a29, v0
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a30, v1
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a31, v2
@@ -2621,85 +2635,99 @@ define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 {
; LIT-SRCC-NEXT: v_mfma_i32_32x32x4i8 a[0:31], v3, v0, a[0:31] cbsz:1 abid:2 blgp:3
; LIT-SRCC-NEXT: s_nop 15
; LIT-SRCC-NEXT: s_nop 1
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a27
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a26
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a25
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a24
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
-; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[34:35] offset:96
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a31
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a30
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a29
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a28
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a3
-; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[34:35] offset:112
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a2
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a19
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a18
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a17
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a16
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a1
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a0
-; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[34:35] offset:64
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a15
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a23
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a22
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a21
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a20
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a14
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a13
-; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[34:35] offset:80
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a12
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a11
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a10
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a9
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a8
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27
; LIT-SRCC-NEXT: s_nop 1
-; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[34:35] offset:32
-; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[34:35] offset:48
-; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[34:35]
-; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[34:35] offset:16
-; LIT-SRCC-NEXT: s_endpgm
-;
-; GFX90A-LABEL: test_mfma_i32_32x32x4i8:
-; GFX90A: ; %bb.0: ; %bb
-; GFX90A-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
-; GFX90A-NEXT: v_mov_b32_e32 v1, 1
-; GFX90A-NEXT: v_mov_b32_e32 v2, 2
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0
-; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, s16
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, s17
-; GFX90A-NEXT: v_accvgpr_write_b32 a2, s18
-; GFX90A-NEXT: v_accvgpr_write_b32 a3, s19
-; GFX90A-NEXT: v_accvgpr_write_b32 a4, s20
-; GFX90A-NEXT: v_accvgpr_write_b32 a5, s21
-; GFX90A-NEXT: v_accvgpr_write_b32 a6, s22
-; GFX90A-NEXT: v_accvgpr_write_b32 a7, s23
-; GFX90A-NEXT: v_accvgpr_write_b32 a8, s24
-; GFX90A-NEXT: v_accvgpr_write_b32 a9, s25
-; GFX90A-NEXT: v_accvgpr_write_b32 a10, s26
-; GFX90A-NEXT: v_accvgpr_write_b32 a11, s27
-; GFX90A-NEXT: v_accvgpr_write_b32 a12, s28
-; GFX90A-NEXT: v_accvgpr_write_b32 a13, s29
-; GFX90A-NEXT: v_accvgpr_write_b32 a14, s30
-; GFX90A-NEXT: v_accvgpr_write_b32 a15, s31
-; GFX90A-NEXT: v_accvgpr_write_b32 a16, s0
-; GFX90A-NEXT: v_accvgpr_write_b32 a17, s1
-; GFX90A-NEXT: v_accvgpr_write_b32 a18, s2
-; GFX90A-NEXT: v_accvgpr_write_b32 a19, s3
-; GFX90A-NEXT: v_accvgpr_write_b32 a20, s4
-; GFX90A-NEXT: v_accvgpr_write_b32 a21, s5
-; GFX90A-NEXT: v_accvgpr_write_b32 a22, s6
-; GFX90A-NEXT: v_accvgpr_write_b32 a23, s7
-; GFX90A-NEXT: v_accvgpr_write_b32 a24, s8
-; GFX90A-NEXT: v_accvgpr_write_b32 a25, s9
+; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96
+; LIT-SRCC-NEXT: s_nop 0
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31
+; LIT-SRCC-NEXT: s_nop 1
+; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:112
+; LIT-SRCC-NEXT: s_nop 0
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19
+; LIT-SRCC-NEXT: s_nop 1
+; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64
+; LIT-SRCC-NEXT: s_nop 0
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23
+; LIT-SRCC-NEXT: s_nop 1
+; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80
+; LIT-SRCC-NEXT: s_nop 0
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11
+; LIT-SRCC-NEXT: s_nop 1
+; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32
+; LIT-SRCC-NEXT: s_nop 0
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
+; LIT-SRCC-NEXT: s_nop 1
+; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48
+; LIT-SRCC-NEXT: s_nop 0
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
+; LIT-SRCC-NEXT: s_nop 1
+; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35]
+; LIT-SRCC-NEXT: s_nop 0
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7
+; LIT-SRCC-NEXT: s_nop 1
+; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16
+; LIT-SRCC-NEXT: s_endpgm
+;
+; GFX90A-LABEL: test_mfma_i32_32x32x4i8:
+; GFX90A: ; %bb.0: ; %bb
+; GFX90A-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX90A-NEXT: v_mov_b32_e32 v1, 1
+; GFX90A-NEXT: v_mov_b32_e32 v2, 2
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0
+; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, s16
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, s17
+; GFX90A-NEXT: v_accvgpr_write_b32 a2, s18
+; GFX90A-NEXT: v_accvgpr_write_b32 a3, s19
+; GFX90A-NEXT: v_accvgpr_write_b32 a4, s20
+; GFX90A-NEXT: v_accvgpr_write_b32 a5, s21
+; GFX90A-NEXT: v_accvgpr_write_b32 a6, s22
+; GFX90A-NEXT: v_accvgpr_write_b32 a7, s23
+; GFX90A-NEXT: v_accvgpr_write_b32 a8, s24
+; GFX90A-NEXT: v_accvgpr_write_b32 a9, s25
+; GFX90A-NEXT: v_accvgpr_write_b32 a10, s26
+; GFX90A-NEXT: v_accvgpr_write_b32 a11, s27
+; GFX90A-NEXT: v_accvgpr_write_b32 a12, s28
+; GFX90A-NEXT: v_accvgpr_write_b32 a13, s29
+; GFX90A-NEXT: v_accvgpr_write_b32 a14, s30
+; GFX90A-NEXT: v_accvgpr_write_b32 a15, s31
+; GFX90A-NEXT: v_accvgpr_write_b32 a16, s0
+; GFX90A-NEXT: v_accvgpr_write_b32 a17, s1
+; GFX90A-NEXT: v_accvgpr_write_b32 a18, s2
+; GFX90A-NEXT: v_accvgpr_write_b32 a19, s3
+; GFX90A-NEXT: v_accvgpr_write_b32 a20, s4
+; GFX90A-NEXT: v_accvgpr_write_b32 a21, s5
+; GFX90A-NEXT: v_accvgpr_write_b32 a22, s6
+; GFX90A-NEXT: v_accvgpr_write_b32 a23, s7
+; GFX90A-NEXT: v_accvgpr_write_b32 a24, s8
+; GFX90A-NEXT: v_accvgpr_write_b32 a25, s9
; GFX90A-NEXT: v_accvgpr_write_b32 a26, s10
; GFX90A-NEXT: v_accvgpr_write_b32 a27, s11
; GFX90A-NEXT: v_accvgpr_write_b32 a28, s12
@@ -2843,134 +2871,134 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8(ptr addrspace(1) %arg) #0 {
; NOLIT-SRCC: ; %bb.0: ; %bb
; NOLIT-SRCC-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 1
-; NOLIT-SRCC-NEXT: v_mov_b32_e32 v16, 0
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v12, 0
; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
; NOLIT-SRCC-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
-; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s0
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v13, s0
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s1
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s2
-; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v17
-; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s3
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v13
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v13, s3
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v1
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v2
-; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v17
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v13
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s4
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s5
-; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s6
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v13, s6
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v1
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v2
-; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v17
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v13
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s7
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s8
-; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s9
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v13, s9
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v1
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a8, v2
-; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a9, v17
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a9, v13
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s10
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s11
-; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s12
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v13, s12
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a10, v1
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a11, v2
-; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a12, v17
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a12, v13
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s13
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s14
-; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s15
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v13, s15
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a13, v1
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a14, v2
-; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a15, v17
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a15, v13
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
; NOLIT-SRCC-NEXT: s_nop 9
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a15
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a14
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a13
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a12
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7
+; NOLIT-SRCC-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:48
+; NOLIT-SRCC-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:32
+; NOLIT-SRCC-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a7
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a6
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a5
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a4
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a11
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a10
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a9
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a8
-; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
-; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
-; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
-; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
+; NOLIT-SRCC-NEXT: s_nop 1
+; NOLIT-SRCC-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17]
; NOLIT-SRCC-NEXT: s_endpgm
;
; LIT-SRCC-LABEL: test_mfma_i32_16x16x4i8:
; LIT-SRCC: ; %bb.0: ; %bb
; LIT-SRCC-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 1
-; LIT-SRCC-NEXT: v_mov_b32_e32 v16, 0
+; LIT-SRCC-NEXT: v_mov_b32_e32 v12, 0
; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
; LIT-SRCC-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
-; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s0
+; LIT-SRCC-NEXT: v_mov_b32_e32 v13, s0
; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s1
; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s2
-; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v17
-; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s3
+; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v13
+; LIT-SRCC-NEXT: v_mov_b32_e32 v13, s3
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v1
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v2
-; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v17
+; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v13
; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s4
; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s5
-; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s6
+; LIT-SRCC-NEXT: v_mov_b32_e32 v13, s6
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v1
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v2
-; LIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v17
+; LIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v13
; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s7
; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s8
-; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s9
+; LIT-SRCC-NEXT: v_mov_b32_e32 v13, s9
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v1
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a8, v2
-; LIT-SRCC-NEXT: v_accvgpr_write_b32 a9, v17
+; LIT-SRCC-NEXT: v_accvgpr_write_b32 a9, v13
; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s10
; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s11
-; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s12
+; LIT-SRCC-NEXT: v_mov_b32_e32 v13, s12
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a10, v1
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a11, v2
-; LIT-SRCC-NEXT: v_accvgpr_write_b32 a12, v17
+; LIT-SRCC-NEXT: v_accvgpr_write_b32 a12, v13
; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s13
; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s14
-; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s15
+; LIT-SRCC-NEXT: v_mov_b32_e32 v13, s15
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a13, v1
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a14, v2
-; LIT-SRCC-NEXT: v_accvgpr_write_b32 a15, v17
+; LIT-SRCC-NEXT: v_accvgpr_write_b32 a15, v13
; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
; LIT-SRCC-NEXT: s_nop 9
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a15
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a14
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a13
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a12
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7
+; LIT-SRCC-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:48
+; LIT-SRCC-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:32
+; LIT-SRCC-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a7
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a6
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a5
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a4
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a11
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a10
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a9
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a8
-; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
-; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
-; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
-; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
+; LIT-SRCC-NEXT: s_nop 1
+; LIT-SRCC-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17]
; LIT-SRCC-NEXT: s_endpgm
;
; GFX90A-LABEL: test_mfma_i32_16x16x4i8:
@@ -3095,30 +3123,37 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8_splatimm_src2_64(ptr addrspac
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a15, 64
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2
; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; NOLIT-SRCC-NEXT: v_mov_b32_e32 v16, 0
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0
; NOLIT-SRCC-NEXT: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
; NOLIT-SRCC-NEXT: s_nop 9
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a15
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a14
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a13
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a12
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a7
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a6
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a5
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a4
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a11
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a10
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a9
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a8
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
-; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
-; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
-; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
-; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; NOLIT-SRCC-NEXT: s_nop 0
+; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
+; NOLIT-SRCC-NEXT: s_nop 0
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11
+; NOLIT-SRCC-NEXT: s_nop 1
+; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
+; NOLIT-SRCC-NEXT: s_nop 0
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7
+; NOLIT-SRCC-NEXT: s_nop 1
+; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
+; NOLIT-SRCC-NEXT: s_nop 0
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
+; NOLIT-SRCC-NEXT: s_nop 1
+; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; NOLIT-SRCC-NEXT: s_endpgm
;
; LIT-SRCC-LABEL: test_mfma_i32_16x16x4i8_splatimm_src2_64:
@@ -3126,30 +3161,33 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8_splatimm_src2_64(ptr addrspac
; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 1
; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2
; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; LIT-SRCC-NEXT: v_mov_b32_e32 v16, 0
+; LIT-SRCC-NEXT: v_mov_b32_e32 v8, 0
; LIT-SRCC-NEXT: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, 64 cbsz:1 abid:2 blgp:3
; LIT-SRCC-NEXT: s_nop 9
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a15
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a14
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a13
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a12
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a7
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a6
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a5
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a4
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a11
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a10
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a9
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a8
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11
; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
-; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
-; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
-; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
-; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; LIT-SRCC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:48
+; LIT-SRCC-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:32
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7
+; LIT-SRCC-NEXT: s_nop 1
+; LIT-SRCC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
+; LIT-SRCC-NEXT: s_nop 0
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
+; LIT-SRCC-NEXT: s_nop 1
+; LIT-SRCC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
; LIT-SRCC-NEXT: s_endpgm
;
; GFX90A-LABEL: test_mfma_i32_16x16x4i8_splatimm_src2_64:
@@ -3594,59 +3632,59 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(ptr addrspace(1)
; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31]
; NOLIT-SRCC-NEXT: s_nop 15
; NOLIT-SRCC-NEXT: s_nop 1
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:112
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35]
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16
; NOLIT-SRCC-NEXT: s_endpgm
@@ -3730,59 +3768,59 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(ptr addrspace(1)
; LIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31]
; LIT-SRCC-NEXT: s_nop 15
; LIT-SRCC-NEXT: s_nop 1
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96
; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:112
; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64
; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80
; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32
; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48
; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35]
; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16
; LIT-SRCC-NEXT: s_endpgm
@@ -4011,22 +4049,22 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_forward_acc(ptr addrspace(1)
; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
; NOLIT-SRCC-NEXT: s_nop 9
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a3
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a2
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a1
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a0
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a1
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a2
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a3
; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:48
; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:32
; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16
@@ -4078,22 +4116,22 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_forward_acc(ptr addrspace(1)
; LIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
; LIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
; LIT-SRCC-NEXT: s_nop 9
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a3
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a2
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a1
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a0
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a1
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a2
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a3
; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:48
; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:32
; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16
@@ -4440,32 +4478,32 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm_splat(ptr addrspace(1) %
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0
; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
; NOLIT-SRCC-NEXT: s_nop 9
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
; NOLIT-SRCC-NEXT: s_nop 0
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; NOLIT-SRCC-NEXT: s_endpgm
@@ -4478,28 +4516,28 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm_splat(ptr addrspace(1) %
; LIT-SRCC-NEXT: v_mov_b32_e32 v8, 0
; LIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, 1.0
; LIT-SRCC-NEXT: s_nop 9
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11
; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
; LIT-SRCC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:48
; LIT-SRCC-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:32
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
; LIT-SRCC-NEXT: s_endpgm
@@ -4584,32 +4622,32 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16_imm_splat(ptr addrspace(1) %
; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], a[0:15]
; NOLIT-SRCC-NEXT: s_nop 15
; NOLIT-SRCC-NEXT: s_nop 1
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
; NOLIT-SRCC-NEXT: s_nop 0
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; NOLIT-SRCC-NEXT: s_endpgm
@@ -4621,33 +4659,31 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16_imm_splat(ptr addrspace(1) %
; LIT-SRCC-NEXT: v_mov_b32_e32 v2, 0x40004000
; LIT-SRCC-NEXT: v_mov_b32_e32 v3, v2
; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; LIT-SRCC-NEXT: v_mov_b32_e32 v13, 0
+; LIT-SRCC-NEXT: v_mov_b32_e32 v16, 0
; LIT-SRCC-NEXT: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], 1.0
; LIT-SRCC-NEXT: s_nop 15
; LIT-SRCC-NEXT: s_nop 1
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a0
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a1
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a2
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a3
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a3
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7
; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
-; LIT-SRCC-NEXT: global_store_dwordx4 v13, v[0:3], s[0:1] offset:48
-; LIT-SRCC-NEXT: global_store_dwordx4 v13, v[4:7], s[0:1] offset:32
-; LIT-SRCC-NEXT: global_store_dwordx4 v13, v[8:11], s[0:1] offset:16
-; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a2
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a1
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a0
-; LIT-SRCC-NEXT: s_nop 1
-; LIT-SRCC-NEXT: global_store_dwordx4 v13, v[9:12], s[0:1]
+; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] offset:48
+; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:32
+; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:16
+; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1]
; LIT-SRCC-NEXT: s_endpgm
;
; GFX90A-LABEL: test_mfma_f32_32x32x8f16_imm_splat:
@@ -4751,60 +4787,60 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm_splat(ptr addrspace(1) %
; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
; NOLIT-SRCC-NEXT: s_nop 15
; NOLIT-SRCC-NEXT: s_nop 1
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31
; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
; NOLIT-SRCC-NEXT: s_nop 0
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; NOLIT-SRCC-NEXT: s_endpgm
@@ -4814,55 +4850,55 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm_splat(ptr addrspace(1) %
; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0
; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0
; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; LIT-SRCC-NEXT: v_mov_b32_e32 v14, 0
+; LIT-SRCC-NEXT: v_mov_b32_e32 v16, 0
; LIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, 0
; LIT-SRCC-NEXT: s_nop 15
; LIT-SRCC-NEXT: s_nop 1
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a27
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a26
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a25
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a24
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a23
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a22
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a21
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a25
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a26
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a27
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a20
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a21
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a22
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a23
; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
-; LIT-SRCC-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] offset:112
-; LIT-SRCC-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:96
-; LIT-SRCC-NEXT: global_store_dwordx4 v14, v[8:11], s[0:1] offset:80
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
+; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] offset:112
+; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:96
+; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:80
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a19
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a18
-; LIT-SRCC-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] offset:48
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a17
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a16
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a17
+; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] offset:48
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a18
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a16
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a19
; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] offset:32
-; LIT-SRCC-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] offset:64
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
+; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] offset:32
+; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:64
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7
; LIT-SRCC-NEXT: s_nop 1
-; LIT-SRCC-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] offset:16
+; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] offset:16
; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
; LIT-SRCC-NEXT: s_nop 1
-; LIT-SRCC-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1]
+; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; LIT-SRCC-NEXT: s_endpgm
;
; GFX90A-LABEL: test_mfma_f32_32x32x1f32_imm_splat:
@@ -5055,32 +5091,32 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm(ptr addrspace(1) %arg) #
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0
; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
; NOLIT-SRCC-NEXT: s_nop 9
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
; NOLIT-SRCC-NEXT: s_nop 0
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; NOLIT-SRCC-NEXT: s_endpgm
@@ -5109,32 +5145,32 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm(ptr addrspace(1) %arg) #
; LIT-SRCC-NEXT: v_mov_b32_e32 v4, 0
; LIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
; LIT-SRCC-NEXT: s_nop 9
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
; LIT-SRCC-NEXT: s_nop 0
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; LIT-SRCC-NEXT: s_endpgm
@@ -5277,60 +5313,60 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) #
; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
; NOLIT-SRCC-NEXT: s_nop 15
; NOLIT-SRCC-NEXT: s_nop 1
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31
; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
; NOLIT-SRCC-NEXT: s_nop 0
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; NOLIT-SRCC-NEXT: s_endpgm
@@ -5376,60 +5412,60 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) #
; LIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
; LIT-SRCC-NEXT: s_nop 15
; LIT-SRCC-NEXT: s_nop 1
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31
; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
; LIT-SRCC-NEXT: s_nop 0
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112
; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96
; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80
; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64
; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; LIT-SRCC-NEXT: s_endpgm
@@ -5880,40 +5916,40 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(ptr addrspace(1) %arg
; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] cbsz:1 abid:2 blgp:3
; NOLIT-SRCC-NEXT: s_nop 15
; NOLIT-SRCC-NEXT: s_nop 1
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a31
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a30
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a29
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a28
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a19
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a18
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a17
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a29
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a30
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a31
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a16
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a23
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a22
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a21
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a17
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a18
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a19
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a20
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v19, a11
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v18, a10
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v17, a9
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a21
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a22
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a23
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v16, a8
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v23, a15
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v22, a14
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v21, a13
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v17, a9
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v18, a10
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v19, a11
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v20, a12
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v27, a3
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v26, a2
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v25, a1
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v21, a13
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v22, a14
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v23, a15
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v24, a0
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v25, a1
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v26, a2
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v27, a3
; NOLIT-SRCC-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:96
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7
; NOLIT-SRCC-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:112
; NOLIT-SRCC-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:64
; NOLIT-SRCC-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:80
@@ -5975,40 +6011,40 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(ptr addrspace(1) %arg
; LIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] cbsz:1 abid:2 blgp:3
; LIT-SRCC-NEXT: s_nop 15
; LIT-SRCC-NEXT: s_nop 1
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a31
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a30
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a29
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a28
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a19
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a18
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a17
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a29
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a30
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a31
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a16
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a23
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a22
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a21
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a17
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a18
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a19
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a20
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v19, a11
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v18, a10
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v17, a9
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a21
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a22
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a23
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v16, a8
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v23, a15
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v22, a14
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v21, a13
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v17, a9
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v18, a10
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v19, a11
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v20, a12
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v27, a3
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v26, a2
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v25, a1
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v21, a13
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v22, a14
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v23, a15
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v24, a0
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v25, a1
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v26, a2
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v27, a3
; LIT-SRCC-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:96
; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7
; LIT-SRCC-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:112
; LIT-SRCC-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:64
; LIT-SRCC-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:80
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
index aae14c8cc87b3..52dcfb735a899 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
@@ -14,21 +14,37 @@
; fp8 x fp8
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 0, ; cbsz
i32 0, ; blgp
@@ -37,21 +53,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0(<8 x
}
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_1_1__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_1_1__cbsz1__blgp1:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,1,0] op_sel_hi:[0,0,0]
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_1_1__cbsz1__blgp1:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,1,0] op_sel_hi:[0,0,0]
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_1_1__cbsz1__blgp1:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,1,0] op_sel_hi:[0,0,0]
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 0, ; cbsz
i32 0, ; blgp
@@ -60,21 +92,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_1_1__cbsz1__blgp1(<8 x
}
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_2__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_2_2__cbsz1__blgp1:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[1,1,0]
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_2_2__cbsz1__blgp1:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[1,1,0]
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_2_2__cbsz1__blgp1:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[1,1,0]
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 0, ; cbsz
i32 0, ; blgp
@@ -83,21 +131,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_2__cbsz1__blgp1(<8 x
}
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_3__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_3__cbsz1__blgp1:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,1,0]
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_3__cbsz1__blgp1:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_3__cbsz1__blgp1:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 0, ; cbsz
i32 0, ; blgp
@@ -106,21 +170,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_3__cbsz1__blgp1(<8 x
}
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_3__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_3__cbsz1__blgp1:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[0,1,0] op_sel_hi:[0,1,0]
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_3__cbsz1__blgp1:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[0,1,0] op_sel_hi:[0,1,0]
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_3__cbsz1__blgp1:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[0,1,0] op_sel_hi:[0,1,0]
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 0, ; cbsz
i32 0, ; blgp
@@ -129,21 +209,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_3__cbsz1__blgp1(<8 x
}
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_0__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_0__cbsz1__blgp1:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,0,0] op_sel_hi:[1,0,0]
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_0__cbsz1__blgp1:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_0__cbsz1__blgp1:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 0, ; cbsz
i32 0, ; blgp
@@ -152,21 +248,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_0__cbsz1__blgp1(<8 x
}
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_3__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_2_3__cbsz1__blgp1:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[0,1,0] op_sel_hi:[1,1,0]
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_2_3__cbsz1__blgp1:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[0,1,0] op_sel_hi:[1,1,0]
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_2_3__cbsz1__blgp1:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[0,1,0] op_sel_hi:[1,1,0]
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 0, ; cbsz
i32 0, ; blgp
@@ -175,21 +287,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_3__cbsz1__blgp1(<8 x
}
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_2__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_2__cbsz1__blgp1:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,0,0] op_sel_hi:[1,1,0]
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_2__cbsz1__blgp1:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,0,0] op_sel_hi:[1,1,0]
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_2__cbsz1__blgp1:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,0,0] op_sel_hi:[1,1,0]
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 0, ; cbsz
i32 0, ; blgp
@@ -199,21 +327,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_2__cbsz1__blgp1(<8 x
; This should be optimized to avoid the scale
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0__constant_scale_0_0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3]
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0__constant_scale_0_0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3]
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0__constant_scale_0_0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3]
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 0, ; cbsz
i32 0, ; blgp
@@ -223,21 +367,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0__cons
; fp8 x bf8
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:1
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:1
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:1
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 0, ; cbsz
i32 1, ; blgp
@@ -247,21 +407,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1(<8 x
; This should be optimized to avoid the scale
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1__constant_scale_0_0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] blgp:1
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1__constant_scale_0_0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] blgp:1
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1__constant_scale_0_0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] blgp:1
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 0, ; cbsz
i32 1, ; blgp
@@ -271,21 +447,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1__cons
; fp8 x fp6
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v14
-; GCN-NEXT: v_accvgpr_write_b32 a1, v15
-; GCN-NEXT: v_accvgpr_write_b32 a2, v16
-; GCN-NEXT: v_accvgpr_write_b32 a3, v17
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:2
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v14
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:2
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v17
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:2
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
i32 0, ; cbsz
i32 2, ; blgp
@@ -295,21 +487,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2(<8 x
; This should be optimized to avoid the scale
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v14
-; GCN-NEXT: v_accvgpr_write_b32 a1, v15
-; GCN-NEXT: v_accvgpr_write_b32 a2, v16
-; GCN-NEXT: v_accvgpr_write_b32 a3, v17
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] blgp:2
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v14
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] blgp:2
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v17
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] blgp:2
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
i32 0, ; cbsz
i32 2, ; blgp
@@ -319,21 +527,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2__cons
; fp8 x bf6
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v14
-; GCN-NEXT: v_accvgpr_write_b32 a1, v15
-; GCN-NEXT: v_accvgpr_write_b32 a2, v16
-; GCN-NEXT: v_accvgpr_write_b32 a3, v17
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:3
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v14
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:3
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v17
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:3
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
i32 0, ; cbsz
i32 3, ; blgp
@@ -343,21 +567,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3(<8 x
; This should be optimized to avoid the scale
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v14
-; GCN-NEXT: v_accvgpr_write_b32 a1, v15
-; GCN-NEXT: v_accvgpr_write_b32 a2, v16
-; GCN-NEXT: v_accvgpr_write_b32 a3, v17
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] blgp:3
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v14
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] blgp:3
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v17
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] blgp:3
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
i32 0, ; cbsz
i32 3, ; blgp
@@ -367,21 +607,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3__cons
; fp8 x fp4
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v12
-; GCN-NEXT: v_accvgpr_write_b32 a1, v13
-; GCN-NEXT: v_accvgpr_write_b32 a2, v14
-; GCN-NEXT: v_accvgpr_write_b32 a3, v15
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] blgp:4
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] blgp:4
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v12
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v13
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v15
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] blgp:4
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2,
i32 0, ; cbsz
i32 4, ; blgp
@@ -391,21 +647,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4(<8 x
; This should be optimized to avoid the scale
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v12
-; GCN-NEXT: v_accvgpr_write_b32 a1, v13
-; GCN-NEXT: v_accvgpr_write_b32 a2, v14
-; GCN-NEXT: v_accvgpr_write_b32 a3, v15
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3] blgp:4
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3] blgp:4
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v12
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v13
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v15
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3] blgp:4
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2,
i32 0, ; cbsz
i32 4, ; blgp
@@ -415,21 +687,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4__cons
; bf8 x fp8
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:1
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:1
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:1
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 1, ; cbsz
i32 0, ; blgp
@@ -439,21 +727,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0(<8 x
; This should be optimized to avoid the scale
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:1
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:1
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:1
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 1, ; cbsz
i32 0, ; blgp
@@ -463,21 +767,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0__cons
; bf8 x bf8
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:1 blgp:1
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:1 blgp:1
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:1 blgp:1
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 1, ; cbsz
i32 1, ; blgp
@@ -488,21 +808,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1(<8 x
; This should be optimized to avoid the scale
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:1 blgp:1
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:1 blgp:1
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:1 blgp:1
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 1, ; cbsz
i32 1, ; blgp
@@ -512,21 +848,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1__cons
; bf8 x fp6
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v14
-; GCN-NEXT: v_accvgpr_write_b32 a1, v15
-; GCN-NEXT: v_accvgpr_write_b32 a2, v16
-; GCN-NEXT: v_accvgpr_write_b32 a3, v17
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:1 blgp:2
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v14
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:1 blgp:2
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v17
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:1 blgp:2
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
i32 1, ; cbsz
i32 2, ; blgp
@@ -535,21 +887,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2(<8 x
}
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v14
-; GCN-NEXT: v_accvgpr_write_b32 a1, v15
-; GCN-NEXT: v_accvgpr_write_b32 a2, v16
-; GCN-NEXT: v_accvgpr_write_b32 a3, v17
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] cbsz:1 blgp:2
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v14
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] cbsz:1 blgp:2
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v17
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] cbsz:1 blgp:2
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
i32 1, ; cbsz
i32 2, ; blgp
@@ -559,21 +927,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2__cons
; bf8 x bf6
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v14
-; GCN-NEXT: v_accvgpr_write_b32 a1, v15
-; GCN-NEXT: v_accvgpr_write_b32 a2, v16
-; GCN-NEXT: v_accvgpr_write_b32 a3, v17
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:1 blgp:3
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v14
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:1 blgp:3
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v17
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:1 blgp:3
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
i32 1, ; cbsz
i32 3, ; blgp
@@ -583,21 +967,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3(<8 x
; This should be optimized to avoid the scale
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v14
-; GCN-NEXT: v_accvgpr_write_b32 a1, v15
-; GCN-NEXT: v_accvgpr_write_b32 a2, v16
-; GCN-NEXT: v_accvgpr_write_b32 a3, v17
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] cbsz:1 blgp:3
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v14
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] cbsz:1 blgp:3
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v17
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] cbsz:1 blgp:3
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
i32 1, ; cbsz
i32 3, ; blgp
@@ -607,21 +1007,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3__cons
; bf8 x fp4
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v12
-; GCN-NEXT: v_accvgpr_write_b32 a1, v13
-; GCN-NEXT: v_accvgpr_write_b32 a2, v14
-; GCN-NEXT: v_accvgpr_write_b32 a3, v15
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:1 blgp:4
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:1 blgp:4
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v12
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v13
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v15
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:1 blgp:4
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2,
i32 1, ; cbsz
i32 4, ; blgp
@@ -631,21 +1047,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4(<8 x
; This should be optimized to avoid the scale
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v12
-; GCN-NEXT: v_accvgpr_write_b32 a1, v13
-; GCN-NEXT: v_accvgpr_write_b32 a2, v14
-; GCN-NEXT: v_accvgpr_write_b32 a3, v15
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3] cbsz:1 blgp:4
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3] cbsz:1 blgp:4
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v12
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v13
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v15
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3] cbsz:1 blgp:4
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2,
i32 1, ; cbsz
i32 4, ; blgp
@@ -655,21 +1087,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4__cons
; fp6 x fp8
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v14
-; GCN-NEXT: v_accvgpr_write_b32 a1, v15
-; GCN-NEXT: v_accvgpr_write_b32 a2, v16
-; GCN-NEXT: v_accvgpr_write_b32 a3, v17
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:2
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v14
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:2
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v17
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:2
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 2, ; cbsz
i32 0, ; blgp
@@ -679,21 +1127,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0(<6 x
; This should be optimized to avoid the scale
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v14
-; GCN-NEXT: v_accvgpr_write_b32 a1, v15
-; GCN-NEXT: v_accvgpr_write_b32 a2, v16
-; GCN-NEXT: v_accvgpr_write_b32 a3, v17
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:2
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v14
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:2
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v17
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:2
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 2, ; cbsz
i32 0, ; blgp
@@ -703,21 +1167,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0__cons
; fp6 x bf8
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v14
-; GCN-NEXT: v_accvgpr_write_b32 a1, v15
-; GCN-NEXT: v_accvgpr_write_b32 a2, v16
-; GCN-NEXT: v_accvgpr_write_b32 a3, v17
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:2 blgp:1
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v14
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:2 blgp:1
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v17
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:2 blgp:1
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 2, ; cbsz
i32 1, ; blgp
@@ -727,21 +1207,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1(<6 x
; This should be optimized to avoid the scale
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v14
-; GCN-NEXT: v_accvgpr_write_b32 a1, v15
-; GCN-NEXT: v_accvgpr_write_b32 a2, v16
-; GCN-NEXT: v_accvgpr_write_b32 a3, v17
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:2 blgp:1
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v14
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:2 blgp:1
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v17
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:2 blgp:1
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 2, ; cbsz
i32 1, ; blgp
@@ -751,21 +1247,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1__cons
; fp6 x fp6
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v12
-; GCN-NEXT: v_accvgpr_write_b32 a1, v13
-; GCN-NEXT: v_accvgpr_write_b32 a2, v14
-; GCN-NEXT: v_accvgpr_write_b32 a3, v15
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:2 blgp:2
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:2 blgp:2
+; SDAG-NEXT: s_nop 7
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v12
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v13
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v15
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:2 blgp:2
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
i32 2, ; cbsz
i32 2, ; blgp
@@ -775,45 +1287,77 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2(<6 x
; This should be optimized to avoid the scale
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v12
-; GCN-NEXT: v_accvgpr_write_b32 a1, v13
-; GCN-NEXT: v_accvgpr_write_b32 a2, v14
-; GCN-NEXT: v_accvgpr_write_b32 a3, v15
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:2 blgp:2
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
- i32 2, ; cbsz
- i32 2, ; blgp
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:2 blgp:2
+; SDAG-NEXT: s_nop 7
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v12
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v13
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v15
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:2 blgp:2
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
+ i32 2, ; cbsz
+ i32 2, ; blgp
i32 0, i32 0, i32 0, i32 0)
ret <4 x float> %result
}
; fp6 x bf6
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v12
-; GCN-NEXT: v_accvgpr_write_b32 a1, v13
-; GCN-NEXT: v_accvgpr_write_b32 a2, v14
-; GCN-NEXT: v_accvgpr_write_b32 a3, v15
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:2 blgp:3
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:2 blgp:3
+; SDAG-NEXT: s_nop 7
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v12
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v13
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v15
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:2 blgp:3
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
i32 2, ; cbsz
i32 3, ; blgp
@@ -823,21 +1367,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3(<6 x
; This should be optimized to avoid the scale
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v12
-; GCN-NEXT: v_accvgpr_write_b32 a1, v13
-; GCN-NEXT: v_accvgpr_write_b32 a2, v14
-; GCN-NEXT: v_accvgpr_write_b32 a3, v15
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:2 blgp:3
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:2 blgp:3
+; SDAG-NEXT: s_nop 7
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v12
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v13
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v15
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:2 blgp:3
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
i32 2, ; cbsz
i32 3, ; blgp
@@ -848,21 +1408,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3__cons
; bf6 x fp8
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v14
-; GCN-NEXT: v_accvgpr_write_b32 a1, v15
-; GCN-NEXT: v_accvgpr_write_b32 a2, v16
-; GCN-NEXT: v_accvgpr_write_b32 a3, v17
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:3
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v14
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:3
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v17
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:3
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 3, ; cbsz
i32 0, ; blgp
@@ -872,21 +1448,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0(<6 x
; This should be optimized to avoid the scale
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v14
-; GCN-NEXT: v_accvgpr_write_b32 a1, v15
-; GCN-NEXT: v_accvgpr_write_b32 a2, v16
-; GCN-NEXT: v_accvgpr_write_b32 a3, v17
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:3
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v14
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:3
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v17
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:3
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 3, ; cbsz
i32 0, ; blgp
@@ -896,21 +1488,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0__cons
; bf6 x bf8
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v14
-; GCN-NEXT: v_accvgpr_write_b32 a1, v15
-; GCN-NEXT: v_accvgpr_write_b32 a2, v16
-; GCN-NEXT: v_accvgpr_write_b32 a3, v17
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:3 blgp:1
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v14
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:3 blgp:1
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v17
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:3 blgp:1
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 3, ; cbsz
i32 1, ; blgp
@@ -920,21 +1528,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1(<6 x
; This should be optimized to avoid the scale
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v14
-; GCN-NEXT: v_accvgpr_write_b32 a1, v15
-; GCN-NEXT: v_accvgpr_write_b32 a2, v16
-; GCN-NEXT: v_accvgpr_write_b32 a3, v17
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:3 blgp:1
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v14
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:3 blgp:1
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v17
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:3 blgp:1
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 3, ; cbsz
i32 1, ; blgp
@@ -944,21 +1568,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1__cons
; bf6 x fp6
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v12
-; GCN-NEXT: v_accvgpr_write_b32 a1, v13
-; GCN-NEXT: v_accvgpr_write_b32 a2, v14
-; GCN-NEXT: v_accvgpr_write_b32 a3, v15
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:3 blgp:2
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:3 blgp:2
+; SDAG-NEXT: s_nop 7
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v12
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v13
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v15
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:3 blgp:2
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
i32 3, ; cbsz
i32 2, ; blgp
@@ -968,21 +1608,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2(<6 x
; This should be optimized to avoid the scale
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v12
-; GCN-NEXT: v_accvgpr_write_b32 a1, v13
-; GCN-NEXT: v_accvgpr_write_b32 a2, v14
-; GCN-NEXT: v_accvgpr_write_b32 a3, v15
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:3 blgp:2
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:3 blgp:2
+; SDAG-NEXT: s_nop 7
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v12
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v13
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v15
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:3 blgp:2
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
i32 3, ; cbsz
i32 2, ; blgp
@@ -992,45 +1648,77 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2__cons
; bf6 x fp4
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v10
-; GCN-NEXT: v_accvgpr_write_b32 a1, v11
-; GCN-NEXT: v_accvgpr_write_b32 a2, v12
-; GCN-NEXT: v_accvgpr_write_b32 a3, v13
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:3 blgp:4
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2,
- i32 3, ; cbsz
- i32 4, ; blgp
- i32 0, i32 %scale0, i32 0, i32 %scale1)
- ret <4 x float> %result
-}
-
-; This should be optimized to avoid the scale
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v10
-; GCN-NEXT: v_accvgpr_write_b32 a1, v11
-; GCN-NEXT: v_accvgpr_write_b32 a2, v12
-; GCN-NEXT: v_accvgpr_write_b32 a3, v13
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3] cbsz:3 blgp:4
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v11
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v10
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:3 blgp:4
+; SDAG-NEXT: s_nop 7
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v10
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v11
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v12
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v13
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:3 blgp:4
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2,
+ i32 3, ; cbsz
+ i32 4, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <4 x float> %result
+}
+
+; This should be optimized to avoid the scale
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) {
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v11
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v10
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3] cbsz:3 blgp:4
+; SDAG-NEXT: s_nop 7
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v10
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v11
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v12
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v13
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3] cbsz:3 blgp:4
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2,
i32 3, ; cbsz
i32 4, ; blgp
@@ -1040,21 +1728,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4__cons
; bf6 x bf6
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v12
-; GCN-NEXT: v_accvgpr_write_b32 a1, v13
-; GCN-NEXT: v_accvgpr_write_b32 a2, v14
-; GCN-NEXT: v_accvgpr_write_b32 a3, v15
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:3 blgp:3
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:3 blgp:3
+; SDAG-NEXT: s_nop 7
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v12
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v13
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v15
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:3 blgp:3
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
i32 3, ; cbsz
i32 3, ; blgp
@@ -1064,21 +1768,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3(<6 x
; This should be optimized to avoid the scale
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v12
-; GCN-NEXT: v_accvgpr_write_b32 a1, v13
-; GCN-NEXT: v_accvgpr_write_b32 a2, v14
-; GCN-NEXT: v_accvgpr_write_b32 a3, v15
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:3 blgp:3
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:3 blgp:3
+; SDAG-NEXT: s_nop 7
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v12
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v13
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v15
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:3 blgp:3
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
i32 3, ; cbsz
i32 3, ; blgp
@@ -1088,21 +1808,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3__cons
; fp6 x fp4
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v10
-; GCN-NEXT: v_accvgpr_write_b32 a1, v11
-; GCN-NEXT: v_accvgpr_write_b32 a2, v12
-; GCN-NEXT: v_accvgpr_write_b32 a3, v13
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:2 blgp:4
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v11
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v10
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:2 blgp:4
+; SDAG-NEXT: s_nop 7
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v10
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v11
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v12
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v13
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:2 blgp:4
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2,
i32 2, ; cbsz
i32 4, ; blgp
@@ -1112,21 +1848,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4(<6 x
; This should be optimized to avoid the scale
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v10
-; GCN-NEXT: v_accvgpr_write_b32 a1, v11
-; GCN-NEXT: v_accvgpr_write_b32 a2, v12
-; GCN-NEXT: v_accvgpr_write_b32 a3, v13
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3] cbsz:2 blgp:4
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v11
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v10
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3] cbsz:2 blgp:4
+; SDAG-NEXT: s_nop 7
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v10
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v11
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v12
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v13
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3] cbsz:2 blgp:4
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2,
i32 2, ; cbsz
i32 4, ; blgp
@@ -1136,21 +1888,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4__cons
; fp4 x fp8
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v12
-; GCN-NEXT: v_accvgpr_write_b32 a1, v13
-; GCN-NEXT: v_accvgpr_write_b32 a2, v14
-; GCN-NEXT: v_accvgpr_write_b32 a3, v15
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:4
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:4
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v12
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v13
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v15
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:4
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 4, ; cbsz
i32 0, ; blgp
@@ -1160,21 +1928,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0(<4 x
; This should be optimized to avoid the scale
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v12
-; GCN-NEXT: v_accvgpr_write_b32 a1, v13
-; GCN-NEXT: v_accvgpr_write_b32 a2, v14
-; GCN-NEXT: v_accvgpr_write_b32 a3, v15
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3] cbsz:4
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3] cbsz:4
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v12
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v13
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v15
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3] cbsz:4
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 4, ; cbsz
i32 0, ; blgp
@@ -1184,21 +1968,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0__cons
; fp4 x bf8
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v12
-; GCN-NEXT: v_accvgpr_write_b32 a1, v13
-; GCN-NEXT: v_accvgpr_write_b32 a2, v14
-; GCN-NEXT: v_accvgpr_write_b32 a3, v15
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:4 blgp:1
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:4 blgp:1
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v12
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v13
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v15
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:4 blgp:1
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 4, ; cbsz
i32 1, ; blgp
@@ -1208,21 +2008,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1(<4 x
; This should be optimized to avoid the scale
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v12
-; GCN-NEXT: v_accvgpr_write_b32 a1, v13
-; GCN-NEXT: v_accvgpr_write_b32 a2, v14
-; GCN-NEXT: v_accvgpr_write_b32 a3, v15
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3] cbsz:4 blgp:1
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3] cbsz:4 blgp:1
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v12
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v13
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v15
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3] cbsz:4 blgp:1
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 4, ; cbsz
i32 1, ; blgp
@@ -1232,21 +2048,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1__cons
; fp4 x fp6
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v10
-; GCN-NEXT: v_accvgpr_write_b32 a1, v11
-; GCN-NEXT: v_accvgpr_write_b32 a2, v12
-; GCN-NEXT: v_accvgpr_write_b32 a3, v13
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:4 blgp:2
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v11
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v10
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:4 blgp:2
+; SDAG-NEXT: s_nop 7
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v10
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v11
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v12
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v13
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:4 blgp:2
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
i32 4, ; cbsz
i32 2, ; blgp
@@ -1256,21 +2088,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2(<4 x
; This should be optimized to avoid the scale
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v10
-; GCN-NEXT: v_accvgpr_write_b32 a1, v11
-; GCN-NEXT: v_accvgpr_write_b32 a2, v12
-; GCN-NEXT: v_accvgpr_write_b32 a3, v13
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3] cbsz:4 blgp:2
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v11
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v10
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3] cbsz:4 blgp:2
+; SDAG-NEXT: s_nop 7
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v10
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v11
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v12
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v13
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3] cbsz:4 blgp:2
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
i32 4, ; cbsz
i32 2, ; blgp
@@ -1280,21 +2128,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2__cons
; fp4 x bf6
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v10
-; GCN-NEXT: v_accvgpr_write_b32 a1, v11
-; GCN-NEXT: v_accvgpr_write_b32 a2, v12
-; GCN-NEXT: v_accvgpr_write_b32 a3, v13
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:4 blgp:3
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v11
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v10
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:4 blgp:3
+; SDAG-NEXT: s_nop 7
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v10
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v11
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v12
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v13
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:4 blgp:3
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
i32 4, ; cbsz
i32 3, ; blgp
@@ -1304,21 +2168,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3(<4 x
; This should be optimized to avoid the scale
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v10
-; GCN-NEXT: v_accvgpr_write_b32 a1, v11
-; GCN-NEXT: v_accvgpr_write_b32 a2, v12
-; GCN-NEXT: v_accvgpr_write_b32 a3, v13
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3] cbsz:4 blgp:3
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v11
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v10
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3] cbsz:4 blgp:3
+; SDAG-NEXT: s_nop 7
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v10
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v11
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v12
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v13
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3] cbsz:4 blgp:3
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
i32 4, ; cbsz
i32 3, ; blgp
@@ -1328,21 +2208,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3__cons
; fp4 x fp4
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4(<4 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v8
-; GCN-NEXT: v_accvgpr_write_b32 a1, v9
-; GCN-NEXT: v_accvgpr_write_b32 a2, v10
-; GCN-NEXT: v_accvgpr_write_b32 a3, v11
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:7], a[0:3], v12, v13 op_sel_hi:[0,0,0] cbsz:4 blgp:4
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:7], a[0:3], v12, v13 op_sel_hi:[0,0,0] cbsz:4 blgp:4
+; SDAG-NEXT: s_nop 7
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v8
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v9
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v10
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v11
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:7], a[0:3], v12, v13 op_sel_hi:[0,0,0] cbsz:4 blgp:4
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2,
i32 4, ; cbsz
i32 4, ; blgp
@@ -1352,21 +2248,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4(<4 x
; This should be optimized to avoid the scale
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0(<4 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v8
-; GCN-NEXT: v_accvgpr_write_b32 a1, v9
-; GCN-NEXT: v_accvgpr_write_b32 a2, v10
-; GCN-NEXT: v_accvgpr_write_b32 a3, v11
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:4 blgp:4
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:4 blgp:4
+; SDAG-NEXT: s_nop 7
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v8
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v9
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v10
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v11
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:4 blgp:4
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2,
i32 4, ; cbsz
i32 4, ; blgp
@@ -1379,97 +2291,17 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4__cons
; --------------------------------------------------------------------
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__sgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 inreg %scale0, i32 inreg %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__sgpr_scaleB:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: v_mov_b32_e32 v16, s0
-; GCN-NEXT: v_mov_b32_e32 v17, s1
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0]
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
- ret <4 x float> %result
-}
-
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__vgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 inreg %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__vgpr_scaleB:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: v_mov_b32_e32 v16, s0
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v20 op_sel_hi:[0,0,0]
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
- ret <4 x float> %result
-}
-
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__vgpr_scaleA__sgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 inreg %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__vgpr_scaleA__sgpr_scaleB:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: v_mov_b32_e32 v16, s0
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v16 op_sel_hi:[0,0,0]
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
- ret <4 x float> %result
-}
-
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inreg %arg0, <8 x i32> inreg %arg1, <4 x float> inreg %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs:
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__sgpr_scaleB:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v14, s0
-; SDAG-NEXT: v_mov_b32_e32 v15, s1
-; SDAG-NEXT: v_mov_b32_e32 v16, s2
-; SDAG-NEXT: v_mov_b32_e32 v17, s3
-; SDAG-NEXT: v_mov_b32_e32 v18, s16
-; SDAG-NEXT: v_mov_b32_e32 v19, s17
-; SDAG-NEXT: v_mov_b32_e32 v20, s18
-; SDAG-NEXT: v_mov_b32_e32 v21, s19
-; SDAG-NEXT: v_mov_b32_e32 v4, s28
-; SDAG-NEXT: v_mov_b32_e32 v5, s29
-; SDAG-NEXT: v_mov_b32_e32 v6, s20
-; SDAG-NEXT: v_mov_b32_e32 v7, s21
-; SDAG-NEXT: v_mov_b32_e32 v8, s22
-; SDAG-NEXT: v_mov_b32_e32 v9, s23
-; SDAG-NEXT: v_mov_b32_e32 v10, s24
-; SDAG-NEXT: v_mov_b32_e32 v11, s25
-; SDAG-NEXT: v_mov_b32_e32 v12, s26
-; SDAG-NEXT: v_mov_b32_e32 v13, s27
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v4
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v5
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v0
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v1
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_mov_b32_e32 v16, s0
+; SDAG-NEXT: v_mov_b32_e32 v17, s1
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[6:13], a[0:3], v2, v3 op_sel_hi:[0,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0]
; SDAG-NEXT: s_nop 11
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
@@ -1477,29 +2309,17 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inr
; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs:
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__sgpr_scaleB:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: s_mov_b32 s12, s0
-; GISEL-NEXT: s_mov_b32 s13, s1
-; GISEL-NEXT: s_mov_b32 s14, s2
-; GISEL-NEXT: s_mov_b32 s15, s3
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
-; GISEL-NEXT: v_mov_b32_e32 v20, s28
-; GISEL-NEXT: v_mov_b32_e32 v21, s29
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v0
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v1
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: v_mov_b32_e32 v16, s0
+; GISEL-NEXT: v_mov_b32_e32 v17, s1
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[4:11], v[12:19], a[0:3], v2, v3 op_sel_hi:[0,0,0]
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0]
; GISEL-NEXT: s_nop 11
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
@@ -1510,22 +2330,162 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inr
ret <4 x float> %result
}
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr(<8 x i32> inreg %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 inreg %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr:
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__vgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 inreg %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__vgpr_scaleB:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v14, s0
-; SDAG-NEXT: v_mov_b32_e32 v15, s1
-; SDAG-NEXT: v_mov_b32_e32 v16, s2
-; SDAG-NEXT: v_mov_b32_e32 v17, s3
-; SDAG-NEXT: v_mov_b32_e32 v18, s16
-; SDAG-NEXT: v_mov_b32_e32 v19, s17
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_mov_b32_e32 v16, s0
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v20 op_sel_hi:[0,0,0]
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__vgpr_scaleB:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: v_mov_b32_e32 v16, s0
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v20 op_sel_hi:[0,0,0]
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <4 x float> %result
+}
+
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__vgpr_scaleA__sgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 inreg %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__vgpr_scaleA__sgpr_scaleB:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_mov_b32_e32 v16, s0
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v16 op_sel_hi:[0,0,0]
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__vgpr_scaleA__sgpr_scaleB:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: v_mov_b32_e32 v16, s0
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v16 op_sel_hi:[0,0,0]
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <4 x float> %result
+}
+
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inreg %arg0, <8 x i32> inreg %arg1, <4 x float> inreg %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_mov_b32_e32 v12, s0
+; SDAG-NEXT: v_mov_b32_e32 v13, s1
+; SDAG-NEXT: v_mov_b32_e32 v14, s2
+; SDAG-NEXT: v_mov_b32_e32 v15, s3
+; SDAG-NEXT: v_mov_b32_e32 v16, s16
+; SDAG-NEXT: v_mov_b32_e32 v17, s17
+; SDAG-NEXT: v_mov_b32_e32 v18, s18
+; SDAG-NEXT: v_mov_b32_e32 v19, s19
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v1
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v0
+; SDAG-NEXT: v_mov_b32_e32 v4, s20
+; SDAG-NEXT: v_mov_b32_e32 v5, s21
+; SDAG-NEXT: v_mov_b32_e32 v6, s22
+; SDAG-NEXT: v_mov_b32_e32 v7, s23
+; SDAG-NEXT: v_mov_b32_e32 v8, s24
+; SDAG-NEXT: v_mov_b32_e32 v9, s25
+; SDAG-NEXT: v_mov_b32_e32 v10, s26
+; SDAG-NEXT: v_mov_b32_e32 v11, s27
+; SDAG-NEXT: v_accvgpr_write_b32 a0, s28
+; SDAG-NEXT: v_accvgpr_write_b32 a1, s29
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[12:19], v[4:11], a[0:3], v2, v3 op_sel_hi:[0,0,0]
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s12, s0
+; GISEL-NEXT: s_mov_b32 s13, s1
+; GISEL-NEXT: s_mov_b32 s14, s2
+; GISEL-NEXT: s_mov_b32 s15, s3
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v0
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v1
+; GISEL-NEXT: v_accvgpr_write_b32 a0, s28
+; GISEL-NEXT: v_accvgpr_write_b32 a1, s29
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[4:11], v[12:19], a[0:3], v2, v3 op_sel_hi:[0,0,0]
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <4 x float> %result
+}
+
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr(<8 x i32> inreg %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 inreg %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_mov_b32_e32 v14, s0
+; SDAG-NEXT: v_mov_b32_e32 v15, s1
+; SDAG-NEXT: v_mov_b32_e32 v16, s2
+; SDAG-NEXT: v_mov_b32_e32 v17, s3
+; SDAG-NEXT: v_mov_b32_e32 v18, s16
+; SDAG-NEXT: v_mov_b32_e32 v19, s17
; SDAG-NEXT: v_mov_b32_e32 v20, s18
; SDAG-NEXT: v_mov_b32_e32 v21, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
; SDAG-NEXT: v_mov_b32_e32 v8, s20
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], v8, v12 op_sel_hi:[0,0,0]
@@ -1576,10 +2536,10 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp
; SDAG-NEXT: v_mov_b32_e32 v19, s17
; SDAG-NEXT: v_mov_b32_e32 v20, s18
; SDAG-NEXT: v_mov_b32_e32 v21, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
; SDAG-NEXT: v_mov_b32_e32 v8, s20
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], v12, v8 op_sel_hi:[0,0,0]
@@ -1622,6 +2582,10 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp
; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgpr_sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
; SDAG-NEXT: v_mov_b32_e32 v14, s0
; SDAG-NEXT: v_mov_b32_e32 v15, s1
; SDAG-NEXT: v_mov_b32_e32 v16, s2
@@ -1630,10 +2594,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp
; SDAG-NEXT: v_mov_b32_e32 v19, s17
; SDAG-NEXT: v_mov_b32_e32 v20, s18
; SDAG-NEXT: v_mov_b32_e32 v21, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
; SDAG-NEXT: v_mov_b32_e32 v8, s20
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[14:21], a[0:3], v12, v8 op_sel_hi:[0,0,0]
@@ -1652,13 +2612,13 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp
; GISEL-NEXT: s_mov_b32 s14, s2
; GISEL-NEXT: s_mov_b32 s15, s3
; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13]
; GISEL-NEXT: v_accvgpr_write_b32 a0, v8
; GISEL-NEXT: v_accvgpr_write_b32 a1, v9
; GISEL-NEXT: v_accvgpr_write_b32 a2, v10
; GISEL-NEXT: v_accvgpr_write_b32 a3, v11
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13]
; GISEL-NEXT: v_mov_b32_e32 v8, s20
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[14:21], a[0:3], v12, v8 op_sel_hi:[0,0,0]
@@ -1751,14 +2711,14 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_inlineimm__
; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v20, -2
-; SDAG-NEXT: v_mov_b32_e32 v21, 33
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_mov_b32_e32 v16, -2
+; SDAG-NEXT: v_mov_b32_e32 v17, 33
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v21, v20 op_sel_hi:[1,1,0]
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v17, v16 op_sel_hi:[1,1,0]
; SDAG-NEXT: s_nop 11
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
@@ -1791,14 +2751,14 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale
; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scaleB_inlineimm:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v20, -2
-; SDAG-NEXT: v_mov_b32_e32 v21, 0x41
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_mov_b32_e32 v16, -2
+; SDAG-NEXT: v_mov_b32_e32 v17, 0x41
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v21, v20 op_sel_hi:[1,1,0]
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v17, v16 op_sel_hi:[1,1,0]
; SDAG-NEXT: s_nop 11
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
@@ -1831,14 +2791,14 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale
; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scaleB_kimm:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v20, 0x4d
-; SDAG-NEXT: v_mov_b32_e32 v21, 0x41
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_mov_b32_e32 v16, 0x4d
+; SDAG-NEXT: v_mov_b32_e32 v17, 0x41
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v21, v20 op_sel_hi:[1,1,0]
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v17, v16 op_sel_hi:[1,1,0]
; SDAG-NEXT: s_nop 11
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
@@ -2185,58 +3145,328 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; This should be optimized to avoid the scale
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_a(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_a:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3]
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_a:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3]
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_a:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3]
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
ret <4 x float> %result
}
; This should be optimized to avoid the scale, with non-0 op_sel arguments.
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_b(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_b:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3]
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_b:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3]
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_b:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3]
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 0, i32 1, i32 0)
ret <4 x float> %result
}
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1:
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_mov_b32_e32 v16, 1
+; SDAG-NEXT: v_mov_b32_e32 v17, 0
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v17, v16 op_sel_hi:[0,0,0]
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: v_mov_b32_e32 v16, 0
+; GISEL-NEXT: v_mov_b32_e32 v17, 1
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0]
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1)
+ ret <4 x float> %result
+}
+
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
+; SDAG-NEXT: v_mov_b32_e32 v17, 1
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v17, v16 op_sel_hi:[0,0,0]
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: v_mov_b32_e32 v16, 1
+; GISEL-NEXT: v_mov_b32_e32 v17, 0
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0]
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0)
+ ret <4 x float> %result
+}
+
+; --------------------------------------------------------------------
+; Incorrect signature for format cases (IR vector too large)
+; --------------------------------------------------------------------
+
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp6(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp6:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:2
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp6:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:2
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
+ i32 0, ; cbsz
+ i32 2, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <4 x float> %result
+}
+
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp8(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:2
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:2
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
+ i32 2, ; cbsz
+ i32 0, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <4 x float> %result
+}
+
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:2 blgp:2
+; SDAG-NEXT: s_nop 7
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:2 blgp:2
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
+ i32 2, ; cbsz
+ i32 2, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <4 x float> %result
+}
+
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6__0_scale(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6__0_scale:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:2 blgp:2
+; SDAG-NEXT: s_nop 7
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6__0_scale:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:2 blgp:2
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
+ i32 2, ; cbsz
+ i32 2, ; blgp
+ i32 0, i32 0, i32 0, i32 0)
+ ret <4 x float> %result
+}
+
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp4(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp4:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v20, 1
-; SDAG-NEXT: v_mov_b32_e32 v21, 0
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v21, v20 op_sel_hi:[0,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:4
; SDAG-NEXT: s_nop 11
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
@@ -2244,39 +3474,38 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1(<8
; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1:
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp4:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: v_mov_b32_e32 v16, 0
-; GISEL-NEXT: v_mov_b32_e32 v17, 1
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0]
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:4
; GISEL-NEXT: s_nop 11
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1)
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
+ i32 0, ; cbsz
+ i32 4, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <4 x float> %result
}
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a:
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp8(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp8:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v20, 0
-; SDAG-NEXT: v_mov_b32_e32 v21, 1
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v21, v20 op_sel_hi:[0,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:4
; SDAG-NEXT: s_nop 11
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
@@ -2284,162 +3513,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a(
; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a:
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp8:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: v_mov_b32_e32 v16, 1
-; GISEL-NEXT: v_mov_b32_e32 v17, 0
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0]
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:4
; GISEL-NEXT: s_nop 11
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0)
- ret <4 x float> %result
-}
-
-; --------------------------------------------------------------------
-; Incorrect signature for format cases (IR vector too large)
-; --------------------------------------------------------------------
-
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp6(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp6:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:2
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
- i32 0, ; cbsz
- i32 2, ; blgp
- i32 0, i32 %scale0, i32 0, i32 %scale1)
- ret <4 x float> %result
-}
-
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp8(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp8:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:2
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
- i32 2, ; cbsz
- i32 0, ; blgp
- i32 0, i32 %scale0, i32 0, i32 %scale1)
- ret <4 x float> %result
-}
-
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:2 blgp:2
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
- i32 2, ; cbsz
- i32 2, ; blgp
- i32 0, i32 %scale0, i32 0, i32 %scale1)
- ret <4 x float> %result
-}
-
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6__0_scale(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6__0_scale:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:2 blgp:2
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
- i32 2, ; cbsz
- i32 2, ; blgp
- i32 0, i32 0, i32 0, i32 0)
- ret <4 x float> %result
-}
-
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp4(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp4:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:4
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
- i32 0, ; cbsz
- i32 4, ; blgp
- i32 0, i32 %scale0, i32 0, i32 %scale1)
- ret <4 x float> %result
-}
-
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp8(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp8:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:4
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 4, ; cbsz
i32 0, ; blgp
@@ -2448,21 +3536,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp8(
}
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v6i32_fp4(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v6i32_fp4:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v14
-; GCN-NEXT: v_accvgpr_write_b32 a1, v15
-; GCN-NEXT: v_accvgpr_write_b32 a2, v16
-; GCN-NEXT: v_accvgpr_write_b32 a3, v17
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:4
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v6i32_fp4:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v14
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:4
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v6i32_fp4:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v17
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:4
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
i32 0, ; cbsz
i32 4, ; blgp
@@ -2471,21 +3575,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v6i32_fp4(
}
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v6i32_fp4__v8i32_fp8(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v6i32_fp4__v8i32_fp8:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v14
-; GCN-NEXT: v_accvgpr_write_b32 a1, v15
-; GCN-NEXT: v_accvgpr_write_b32 a2, v16
-; GCN-NEXT: v_accvgpr_write_b32 a3, v17
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:4
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v6i32_fp4__v8i32_fp8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v14
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:4
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v6i32_fp4__v8i32_fp8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v17
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:4
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 4, ; cbsz
i32 0, ; blgp
@@ -2494,21 +3614,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v6i32_fp4__v8i32_fp8(
}
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:4 blgp:4
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:4 blgp:4
+; SDAG-NEXT: s_nop 7
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:4 blgp:4
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 4, ; cbsz
i32 4, ; blgp
@@ -2517,21 +3653,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4(
}
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4__0_scale(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4__0_scale:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:4 blgp:4
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4__0_scale:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:4 blgp:4
+; SDAG-NEXT: s_nop 7
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4__0_scale:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:4 blgp:4
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 4, ; cbsz
i32 4, ; blgp
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
index f0205a3a788ed..7b7865e3434db 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
@@ -17,27 +17,27 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0(<8 x
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
-; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: scratch_load_dword a15, off, s32
+; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword v17, off, s32 offset:4
; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[0,0,0]
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
@@ -61,11 +61,11 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0(<8 x
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4
-; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8
; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4
+; GISEL-NEXT: scratch_load_dword v17, off, s32 offset:8
; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
@@ -81,7 +81,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0(<8 x
; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0]
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0]
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
@@ -112,27 +112,27 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_1_1__cbsz1__blgp1(<8 x
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_1_1__cbsz1__blgp1:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
-; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: scratch_load_dword a15, off, s32
+; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword v17, off, s32 offset:4
; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel:[1,1,0] op_sel_hi:[0,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel:[1,1,0] op_sel_hi:[0,0,0]
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
@@ -156,11 +156,11 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_1_1__cbsz1__blgp1(<8 x
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_1_1__cbsz1__blgp1:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4
-; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8
; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4
+; GISEL-NEXT: scratch_load_dword v17, off, s32 offset:8
; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
@@ -176,7 +176,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_1_1__cbsz1__blgp1(<8 x
; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel:[1,1,0] op_sel_hi:[0,0,0]
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel:[1,1,0] op_sel_hi:[0,0,0]
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
@@ -207,27 +207,27 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_2__cbsz1__blgp1(<8 x
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_2_2__cbsz1__blgp1:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
-; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: scratch_load_dword a15, off, s32
+; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword v17, off, s32 offset:4
; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0]
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[1,1,0]
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
@@ -251,11 +251,11 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_2__cbsz1__blgp1(<8 x
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_2_2__cbsz1__blgp1:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4
-; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8
; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4
+; GISEL-NEXT: scratch_load_dword v17, off, s32 offset:8
; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
@@ -271,7 +271,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_2__cbsz1__blgp1(<8 x
; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0]
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[1,1,0]
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
@@ -302,27 +302,27 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_3__cbsz1__blgp1(<8 x
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_3_3__cbsz1__blgp1:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
-; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: scratch_load_dword a15, off, s32
+; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword v17, off, s32 offset:4
; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel:[1,1,0] op_sel_hi:[1,1,0]
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
@@ -346,11 +346,11 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_3__cbsz1__blgp1(<8 x
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_3_3__cbsz1__blgp1:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4
-; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8
; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4
+; GISEL-NEXT: scratch_load_dword v17, off, s32 offset:8
; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
@@ -366,7 +366,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_3__cbsz1__blgp1(<8 x
; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel:[1,1,0] op_sel_hi:[1,1,0]
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
@@ -397,27 +397,27 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_3__cbsz1__blgp1(<8 x
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_3__cbsz1__blgp1:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
-; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: scratch_load_dword a15, off, s32
+; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword v17, off, s32 offset:4
; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel:[0,1,0] op_sel_hi:[0,1,0]
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel:[0,1,0] op_sel_hi:[0,1,0]
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
@@ -441,11 +441,11 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_3__cbsz1__blgp1(<8 x
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_3__cbsz1__blgp1:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4
-; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8
; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4
+; GISEL-NEXT: scratch_load_dword v17, off, s32 offset:8
; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
@@ -461,7 +461,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_3__cbsz1__blgp1(<8 x
; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel:[0,1,0] op_sel_hi:[0,1,0]
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel:[0,1,0] op_sel_hi:[0,1,0]
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
@@ -492,27 +492,27 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_0__cbsz1__blgp1(<8 x
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_3_0__cbsz1__blgp1:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
-; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: scratch_load_dword a15, off, s32
+; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword v17, off, s32 offset:4
; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
@@ -536,11 +536,11 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_0__cbsz1__blgp1(<8 x
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_3_0__cbsz1__blgp1:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4
-; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8
; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4
+; GISEL-NEXT: scratch_load_dword v17, off, s32 offset:8
; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
@@ -556,7 +556,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_0__cbsz1__blgp1(<8 x
; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
@@ -587,27 +587,27 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_3__cbsz1__blgp1(<8 x
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_2_3__cbsz1__blgp1:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
-; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: scratch_load_dword a15, off, s32
+; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword v17, off, s32 offset:4
; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel:[0,1,0] op_sel_hi:[1,1,0]
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel:[0,1,0] op_sel_hi:[1,1,0]
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
@@ -631,11 +631,11 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_3__cbsz1__blgp1(<8 x
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_2_3__cbsz1__blgp1:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4
-; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8
; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4
+; GISEL-NEXT: scratch_load_dword v17, off, s32 offset:8
; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
@@ -651,7 +651,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_3__cbsz1__blgp1(<8 x
; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel:[0,1,0] op_sel_hi:[1,1,0]
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel:[0,1,0] op_sel_hi:[1,1,0]
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
@@ -682,27 +682,27 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_2__cbsz1__blgp1(<8 x
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_3_2__cbsz1__blgp1:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
-; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: scratch_load_dword a15, off, s32
+; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword v17, off, s32 offset:4
; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel:[1,0,0] op_sel_hi:[1,1,0]
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel:[1,0,0] op_sel_hi:[1,1,0]
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
@@ -726,11 +726,11 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_2__cbsz1__blgp1(<8 x
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_3_2__cbsz1__blgp1:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4
-; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8
; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4
+; GISEL-NEXT: scratch_load_dword v17, off, s32 offset:8
; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
@@ -746,7 +746,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_2__cbsz1__blgp1(<8 x
; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel:[1,0,0] op_sel_hi:[1,1,0]
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel:[1,0,0] op_sel_hi:[1,1,0]
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
@@ -775,47 +775,89 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_2__cbsz1__blgp1(<8 x
; This should be optimized to avoid the scale
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0__constant_scale_0_0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: scratch_load_dword a15, off, s32
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: v_accvgpr_write_b32 a4, v20
-; GCN-NEXT: v_accvgpr_write_b32 a5, v21
-; GCN-NEXT: v_accvgpr_write_b32 a6, v22
-; GCN-NEXT: v_accvgpr_write_b32 a7, v23
-; GCN-NEXT: v_accvgpr_write_b32 a8, v24
-; GCN-NEXT: v_accvgpr_write_b32 a9, v25
-; GCN-NEXT: v_accvgpr_write_b32 a10, v26
-; GCN-NEXT: v_accvgpr_write_b32 a11, v27
-; GCN-NEXT: v_accvgpr_write_b32 a12, v28
-; GCN-NEXT: v_accvgpr_write_b32 a13, v29
-; GCN-NEXT: v_accvgpr_write_b32 a14, v30
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15]
-; GCN-NEXT: s_nop 15
-; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0__constant_scale_0_0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: scratch_load_dword a15, off, s32
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15]
+; SDAG-NEXT: s_nop 15
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0__constant_scale_0_0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15]
+; GISEL-NEXT: s_nop 15
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 0, ; cbsz
i32 0, ; blgp
@@ -828,27 +870,27 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1(<8 x
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
-; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: scratch_load_dword a15, off, s32
+; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword v17, off, s32 offset:4
; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] blgp:1
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[0,0,0] blgp:1
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
@@ -872,11 +914,11 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1(<8 x
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4
-; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8
; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4
+; GISEL-NEXT: scratch_load_dword v17, off, s32 offset:8
; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
@@ -892,7 +934,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1(<8 x
; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] blgp:1
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] blgp:1
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
@@ -920,47 +962,89 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1(<8 x
}
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1__constant_scale_0_0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: scratch_load_dword a15, off, s32
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: v_accvgpr_write_b32 a4, v20
-; GCN-NEXT: v_accvgpr_write_b32 a5, v21
-; GCN-NEXT: v_accvgpr_write_b32 a6, v22
-; GCN-NEXT: v_accvgpr_write_b32 a7, v23
-; GCN-NEXT: v_accvgpr_write_b32 a8, v24
-; GCN-NEXT: v_accvgpr_write_b32 a9, v25
-; GCN-NEXT: v_accvgpr_write_b32 a10, v26
-; GCN-NEXT: v_accvgpr_write_b32 a11, v27
-; GCN-NEXT: v_accvgpr_write_b32 a12, v28
-; GCN-NEXT: v_accvgpr_write_b32 a13, v29
-; GCN-NEXT: v_accvgpr_write_b32 a14, v30
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] blgp:1
-; GCN-NEXT: s_nop 15
-; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1__constant_scale_0_0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: scratch_load_dword a15, off, s32
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] blgp:1
+; SDAG-NEXT: s_nop 15
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1__constant_scale_0_0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] blgp:1
+; GISEL-NEXT: s_nop 15
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 0, ; cbsz
i32 1, ; blgp
@@ -970,325 +1054,29 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1__cons
; fp8 x fp6
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: scratch_load_dword v31, off, s32
-; GCN-NEXT: v_accvgpr_write_b32 a0, v14
-; GCN-NEXT: v_accvgpr_write_b32 a1, v15
-; GCN-NEXT: v_accvgpr_write_b32 a2, v16
-; GCN-NEXT: v_accvgpr_write_b32 a3, v17
-; GCN-NEXT: v_accvgpr_write_b32 a4, v18
-; GCN-NEXT: v_accvgpr_write_b32 a5, v19
-; GCN-NEXT: v_accvgpr_write_b32 a6, v20
-; GCN-NEXT: v_accvgpr_write_b32 a7, v21
-; GCN-NEXT: v_accvgpr_write_b32 a8, v22
-; GCN-NEXT: v_accvgpr_write_b32 a9, v23
-; GCN-NEXT: v_accvgpr_write_b32 a10, v24
-; GCN-NEXT: v_accvgpr_write_b32 a11, v25
-; GCN-NEXT: v_accvgpr_write_b32 a12, v26
-; GCN-NEXT: v_accvgpr_write_b32 a13, v27
-; GCN-NEXT: v_accvgpr_write_b32 a14, v28
-; GCN-NEXT: v_accvgpr_write_b32 a15, v29
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] blgp:2
-; GCN-NEXT: s_nop 15
-; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
-; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
- i32 0, ; cbsz
- i32 2, ; blgp
- i32 0, i32 %scale0, i32 0, i32 %scale1)
- ret <16 x float> %result
-}
-
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v14
-; GCN-NEXT: v_accvgpr_write_b32 a1, v15
-; GCN-NEXT: v_accvgpr_write_b32 a2, v16
-; GCN-NEXT: v_accvgpr_write_b32 a3, v17
-; GCN-NEXT: v_accvgpr_write_b32 a4, v18
-; GCN-NEXT: v_accvgpr_write_b32 a5, v19
-; GCN-NEXT: v_accvgpr_write_b32 a6, v20
-; GCN-NEXT: v_accvgpr_write_b32 a7, v21
-; GCN-NEXT: v_accvgpr_write_b32 a8, v22
-; GCN-NEXT: v_accvgpr_write_b32 a9, v23
-; GCN-NEXT: v_accvgpr_write_b32 a10, v24
-; GCN-NEXT: v_accvgpr_write_b32 a11, v25
-; GCN-NEXT: v_accvgpr_write_b32 a12, v26
-; GCN-NEXT: v_accvgpr_write_b32 a13, v27
-; GCN-NEXT: v_accvgpr_write_b32 a14, v28
-; GCN-NEXT: v_accvgpr_write_b32 a15, v29
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] blgp:2
-; GCN-NEXT: s_nop 15
-; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
-; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
- i32 0, ; cbsz
- i32 2, ; blgp
- i32 0, i32 0, i32 0, i32 0)
- ret <16 x float> %result
-}
-
-; fp8 x bf6
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: scratch_load_dword v31, off, s32
-; GCN-NEXT: v_accvgpr_write_b32 a0, v14
-; GCN-NEXT: v_accvgpr_write_b32 a1, v15
-; GCN-NEXT: v_accvgpr_write_b32 a2, v16
-; GCN-NEXT: v_accvgpr_write_b32 a3, v17
-; GCN-NEXT: v_accvgpr_write_b32 a4, v18
-; GCN-NEXT: v_accvgpr_write_b32 a5, v19
-; GCN-NEXT: v_accvgpr_write_b32 a6, v20
-; GCN-NEXT: v_accvgpr_write_b32 a7, v21
-; GCN-NEXT: v_accvgpr_write_b32 a8, v22
-; GCN-NEXT: v_accvgpr_write_b32 a9, v23
-; GCN-NEXT: v_accvgpr_write_b32 a10, v24
-; GCN-NEXT: v_accvgpr_write_b32 a11, v25
-; GCN-NEXT: v_accvgpr_write_b32 a12, v26
-; GCN-NEXT: v_accvgpr_write_b32 a13, v27
-; GCN-NEXT: v_accvgpr_write_b32 a14, v28
-; GCN-NEXT: v_accvgpr_write_b32 a15, v29
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] blgp:3
-; GCN-NEXT: s_nop 15
-; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
-; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
- i32 0, ; cbsz
- i32 3, ; blgp
- i32 0, i32 %scale0, i32 0, i32 %scale1)
- ret <16 x float> %result
-}
-
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v14
-; GCN-NEXT: v_accvgpr_write_b32 a1, v15
-; GCN-NEXT: v_accvgpr_write_b32 a2, v16
-; GCN-NEXT: v_accvgpr_write_b32 a3, v17
-; GCN-NEXT: v_accvgpr_write_b32 a4, v18
-; GCN-NEXT: v_accvgpr_write_b32 a5, v19
-; GCN-NEXT: v_accvgpr_write_b32 a6, v20
-; GCN-NEXT: v_accvgpr_write_b32 a7, v21
-; GCN-NEXT: v_accvgpr_write_b32 a8, v22
-; GCN-NEXT: v_accvgpr_write_b32 a9, v23
-; GCN-NEXT: v_accvgpr_write_b32 a10, v24
-; GCN-NEXT: v_accvgpr_write_b32 a11, v25
-; GCN-NEXT: v_accvgpr_write_b32 a12, v26
-; GCN-NEXT: v_accvgpr_write_b32 a13, v27
-; GCN-NEXT: v_accvgpr_write_b32 a14, v28
-; GCN-NEXT: v_accvgpr_write_b32 a15, v29
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] blgp:3
-; GCN-NEXT: s_nop 15
-; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
-; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
- i32 0, ; cbsz
- i32 3, ; blgp
- i32 0, i32 0, i32 0, i32 0)
- ret <16 x float> %result
-}
-
-; fp8 x fp4
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v12
-; GCN-NEXT: v_accvgpr_write_b32 a1, v13
-; GCN-NEXT: v_accvgpr_write_b32 a2, v14
-; GCN-NEXT: v_accvgpr_write_b32 a3, v15
-; GCN-NEXT: v_accvgpr_write_b32 a4, v16
-; GCN-NEXT: v_accvgpr_write_b32 a5, v17
-; GCN-NEXT: v_accvgpr_write_b32 a6, v18
-; GCN-NEXT: v_accvgpr_write_b32 a7, v19
-; GCN-NEXT: v_accvgpr_write_b32 a8, v20
-; GCN-NEXT: v_accvgpr_write_b32 a9, v21
-; GCN-NEXT: v_accvgpr_write_b32 a10, v22
-; GCN-NEXT: v_accvgpr_write_b32 a11, v23
-; GCN-NEXT: v_accvgpr_write_b32 a12, v24
-; GCN-NEXT: v_accvgpr_write_b32 a13, v25
-; GCN-NEXT: v_accvgpr_write_b32 a14, v26
-; GCN-NEXT: v_accvgpr_write_b32 a15, v27
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] blgp:4
-; GCN-NEXT: s_nop 15
-; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
-; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
- i32 0, ; cbsz
- i32 4, ; blgp
- i32 0, i32 %scale0, i32 0, i32 %scale1)
- ret <16 x float> %result
-}
-
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v12
-; GCN-NEXT: v_accvgpr_write_b32 a1, v13
-; GCN-NEXT: v_accvgpr_write_b32 a2, v14
-; GCN-NEXT: v_accvgpr_write_b32 a3, v15
-; GCN-NEXT: v_accvgpr_write_b32 a4, v16
-; GCN-NEXT: v_accvgpr_write_b32 a5, v17
-; GCN-NEXT: v_accvgpr_write_b32 a6, v18
-; GCN-NEXT: v_accvgpr_write_b32 a7, v19
-; GCN-NEXT: v_accvgpr_write_b32 a8, v20
-; GCN-NEXT: v_accvgpr_write_b32 a9, v21
-; GCN-NEXT: v_accvgpr_write_b32 a10, v22
-; GCN-NEXT: v_accvgpr_write_b32 a11, v23
-; GCN-NEXT: v_accvgpr_write_b32 a12, v24
-; GCN-NEXT: v_accvgpr_write_b32 a13, v25
-; GCN-NEXT: v_accvgpr_write_b32 a14, v26
-; GCN-NEXT: v_accvgpr_write_b32 a15, v27
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15] blgp:4
-; GCN-NEXT: s_nop 15
-; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
-; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
- i32 0, ; cbsz
- i32 4, ; blgp
- i32 0, i32 0, i32 0, i32 0)
- ret <16 x float> %result
-}
-
-; bf8 x fp8
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0:
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
-; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v14
+; SDAG-NEXT: scratch_load_dword v14, off, s32
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v15
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:1
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] blgp:2
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
@@ -1309,30 +1097,29 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0(<8 x
; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0:
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4
-; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v14
+; GISEL-NEXT: scratch_load_dword v14, off, s32
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a15, v29
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:1
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] blgp:2
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
@@ -1352,88 +1139,35 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0(<8 x
; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
- i32 1, ; cbsz
- i32 0, ; blgp
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
+ i32 0, ; cbsz
+ i32 2, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: scratch_load_dword a15, off, s32
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: v_accvgpr_write_b32 a4, v20
-; GCN-NEXT: v_accvgpr_write_b32 a5, v21
-; GCN-NEXT: v_accvgpr_write_b32 a6, v22
-; GCN-NEXT: v_accvgpr_write_b32 a7, v23
-; GCN-NEXT: v_accvgpr_write_b32 a8, v24
-; GCN-NEXT: v_accvgpr_write_b32 a9, v25
-; GCN-NEXT: v_accvgpr_write_b32 a10, v26
-; GCN-NEXT: v_accvgpr_write_b32 a11, v27
-; GCN-NEXT: v_accvgpr_write_b32 a12, v28
-; GCN-NEXT: v_accvgpr_write_b32 a13, v29
-; GCN-NEXT: v_accvgpr_write_b32 a14, v30
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:1
-; GCN-NEXT: s_nop 15
-; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
-; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
- i32 1, ; cbsz
- i32 0, ; blgp
- i32 0, i32 0, i32 0, i32 0)
- ret <16 x float> %result
-}
-
-; bf8 x bf8
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1:
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
-; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:1 blgp:1
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v14
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] blgp:2
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
@@ -1454,30 +1188,27 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1(<8 x
; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1:
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4
-; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:1 blgp:1
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a15, v29
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] blgp:2
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
@@ -1497,1992 +1228,2311 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1(<8 x
; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
- i32 1, ; cbsz
- i32 1, ; blgp
- i32 0, i32 %scale0, i32 0, i32 %scale1)
- ret <16 x float> %result
-}
-
-
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: scratch_load_dword a15, off, s32
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: v_accvgpr_write_b32 a4, v20
-; GCN-NEXT: v_accvgpr_write_b32 a5, v21
-; GCN-NEXT: v_accvgpr_write_b32 a6, v22
-; GCN-NEXT: v_accvgpr_write_b32 a7, v23
-; GCN-NEXT: v_accvgpr_write_b32 a8, v24
-; GCN-NEXT: v_accvgpr_write_b32 a9, v25
-; GCN-NEXT: v_accvgpr_write_b32 a10, v26
-; GCN-NEXT: v_accvgpr_write_b32 a11, v27
-; GCN-NEXT: v_accvgpr_write_b32 a12, v28
-; GCN-NEXT: v_accvgpr_write_b32 a13, v29
-; GCN-NEXT: v_accvgpr_write_b32 a14, v30
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:1 blgp:1
-; GCN-NEXT: s_nop 15
-; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
-; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
- i32 1, ; cbsz
- i32 1, ; blgp
- i32 0, i32 0, i32 0, i32 0)
- ret <16 x float> %result
-}
-
-; bf8 x fp6
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: scratch_load_dword v31, off, s32
-; GCN-NEXT: v_accvgpr_write_b32 a0, v14
-; GCN-NEXT: v_accvgpr_write_b32 a1, v15
-; GCN-NEXT: v_accvgpr_write_b32 a2, v16
-; GCN-NEXT: v_accvgpr_write_b32 a3, v17
-; GCN-NEXT: v_accvgpr_write_b32 a4, v18
-; GCN-NEXT: v_accvgpr_write_b32 a5, v19
-; GCN-NEXT: v_accvgpr_write_b32 a6, v20
-; GCN-NEXT: v_accvgpr_write_b32 a7, v21
-; GCN-NEXT: v_accvgpr_write_b32 a8, v22
-; GCN-NEXT: v_accvgpr_write_b32 a9, v23
-; GCN-NEXT: v_accvgpr_write_b32 a10, v24
-; GCN-NEXT: v_accvgpr_write_b32 a11, v25
-; GCN-NEXT: v_accvgpr_write_b32 a12, v26
-; GCN-NEXT: v_accvgpr_write_b32 a13, v27
-; GCN-NEXT: v_accvgpr_write_b32 a14, v28
-; GCN-NEXT: v_accvgpr_write_b32 a15, v29
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:1 blgp:2
-; GCN-NEXT: s_nop 15
-; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
-; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
- i32 1, ; cbsz
- i32 2, ; blgp
- i32 0, i32 %scale0, i32 0, i32 %scale1)
- ret <16 x float> %result
-}
-
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v14
-; GCN-NEXT: v_accvgpr_write_b32 a1, v15
-; GCN-NEXT: v_accvgpr_write_b32 a2, v16
-; GCN-NEXT: v_accvgpr_write_b32 a3, v17
-; GCN-NEXT: v_accvgpr_write_b32 a4, v18
-; GCN-NEXT: v_accvgpr_write_b32 a5, v19
-; GCN-NEXT: v_accvgpr_write_b32 a6, v20
-; GCN-NEXT: v_accvgpr_write_b32 a7, v21
-; GCN-NEXT: v_accvgpr_write_b32 a8, v22
-; GCN-NEXT: v_accvgpr_write_b32 a9, v23
-; GCN-NEXT: v_accvgpr_write_b32 a10, v24
-; GCN-NEXT: v_accvgpr_write_b32 a11, v25
-; GCN-NEXT: v_accvgpr_write_b32 a12, v26
-; GCN-NEXT: v_accvgpr_write_b32 a13, v27
-; GCN-NEXT: v_accvgpr_write_b32 a14, v28
-; GCN-NEXT: v_accvgpr_write_b32 a15, v29
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] cbsz:1 blgp:2
-; GCN-NEXT: s_nop 15
-; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
-; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
- i32 1, ; cbsz
+ i32 0, ; cbsz
i32 2, ; blgp
i32 0, i32 0, i32 0, i32 0)
ret <16 x float> %result
}
-; bf8 x bf6
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: scratch_load_dword v31, off, s32
-; GCN-NEXT: v_accvgpr_write_b32 a0, v14
-; GCN-NEXT: v_accvgpr_write_b32 a1, v15
-; GCN-NEXT: v_accvgpr_write_b32 a2, v16
-; GCN-NEXT: v_accvgpr_write_b32 a3, v17
-; GCN-NEXT: v_accvgpr_write_b32 a4, v18
-; GCN-NEXT: v_accvgpr_write_b32 a5, v19
-; GCN-NEXT: v_accvgpr_write_b32 a6, v20
-; GCN-NEXT: v_accvgpr_write_b32 a7, v21
-; GCN-NEXT: v_accvgpr_write_b32 a8, v22
-; GCN-NEXT: v_accvgpr_write_b32 a9, v23
-; GCN-NEXT: v_accvgpr_write_b32 a10, v24
-; GCN-NEXT: v_accvgpr_write_b32 a11, v25
-; GCN-NEXT: v_accvgpr_write_b32 a12, v26
-; GCN-NEXT: v_accvgpr_write_b32 a13, v27
-; GCN-NEXT: v_accvgpr_write_b32 a14, v28
-; GCN-NEXT: v_accvgpr_write_b32 a15, v29
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:1 blgp:3
-; GCN-NEXT: s_nop 15
-; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; fp8 x bf6
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v14
+; SDAG-NEXT: scratch_load_dword v14, off, s32
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v15
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] blgp:3
+; SDAG-NEXT: s_nop 15
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v14
+; GISEL-NEXT: scratch_load_dword v14, off, s32
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a15, v29
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] blgp:3
+; GISEL-NEXT: s_nop 15
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
- i32 1, ; cbsz
+ i32 0, ; cbsz
i32 3, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v14
-; GCN-NEXT: v_accvgpr_write_b32 a1, v15
-; GCN-NEXT: v_accvgpr_write_b32 a2, v16
-; GCN-NEXT: v_accvgpr_write_b32 a3, v17
-; GCN-NEXT: v_accvgpr_write_b32 a4, v18
-; GCN-NEXT: v_accvgpr_write_b32 a5, v19
-; GCN-NEXT: v_accvgpr_write_b32 a6, v20
-; GCN-NEXT: v_accvgpr_write_b32 a7, v21
-; GCN-NEXT: v_accvgpr_write_b32 a8, v22
-; GCN-NEXT: v_accvgpr_write_b32 a9, v23
-; GCN-NEXT: v_accvgpr_write_b32 a10, v24
-; GCN-NEXT: v_accvgpr_write_b32 a11, v25
-; GCN-NEXT: v_accvgpr_write_b32 a12, v26
-; GCN-NEXT: v_accvgpr_write_b32 a13, v27
-; GCN-NEXT: v_accvgpr_write_b32 a14, v28
-; GCN-NEXT: v_accvgpr_write_b32 a15, v29
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] cbsz:1 blgp:3
-; GCN-NEXT: s_nop 15
-; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
-; GCN-NEXT: s_setpc_b64 s[30:31]
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v14
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] blgp:3
+; SDAG-NEXT: s_nop 15
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a15, v29
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] blgp:3
+; GISEL-NEXT: s_nop 15
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
- i32 1, ; cbsz
+ i32 0, ; cbsz
i32 3, ; blgp
i32 0, i32 0, i32 0, i32 0)
ret <16 x float> %result
}
-; bf8 x fp4
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v12
-; GCN-NEXT: v_accvgpr_write_b32 a1, v13
-; GCN-NEXT: v_accvgpr_write_b32 a2, v14
-; GCN-NEXT: v_accvgpr_write_b32 a3, v15
-; GCN-NEXT: v_accvgpr_write_b32 a4, v16
-; GCN-NEXT: v_accvgpr_write_b32 a5, v17
-; GCN-NEXT: v_accvgpr_write_b32 a6, v18
-; GCN-NEXT: v_accvgpr_write_b32 a7, v19
-; GCN-NEXT: v_accvgpr_write_b32 a8, v20
-; GCN-NEXT: v_accvgpr_write_b32 a9, v21
-; GCN-NEXT: v_accvgpr_write_b32 a10, v22
-; GCN-NEXT: v_accvgpr_write_b32 a11, v23
-; GCN-NEXT: v_accvgpr_write_b32 a12, v24
-; GCN-NEXT: v_accvgpr_write_b32 a13, v25
-; GCN-NEXT: v_accvgpr_write_b32 a14, v26
-; GCN-NEXT: v_accvgpr_write_b32 a15, v27
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:1 blgp:4
-; GCN-NEXT: s_nop 15
-; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; fp8 x fp4
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] blgp:4
+; SDAG-NEXT: s_nop 15
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v12
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v13
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a15, v27
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] blgp:4
+; GISEL-NEXT: s_nop 15
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
- i32 1, ; cbsz
+ i32 0, ; cbsz
i32 4, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v12
-; GCN-NEXT: v_accvgpr_write_b32 a1, v13
-; GCN-NEXT: v_accvgpr_write_b32 a2, v14
-; GCN-NEXT: v_accvgpr_write_b32 a3, v15
-; GCN-NEXT: v_accvgpr_write_b32 a4, v16
-; GCN-NEXT: v_accvgpr_write_b32 a5, v17
-; GCN-NEXT: v_accvgpr_write_b32 a6, v18
-; GCN-NEXT: v_accvgpr_write_b32 a7, v19
-; GCN-NEXT: v_accvgpr_write_b32 a8, v20
-; GCN-NEXT: v_accvgpr_write_b32 a9, v21
-; GCN-NEXT: v_accvgpr_write_b32 a10, v22
-; GCN-NEXT: v_accvgpr_write_b32 a11, v23
-; GCN-NEXT: v_accvgpr_write_b32 a12, v24
-; GCN-NEXT: v_accvgpr_write_b32 a13, v25
-; GCN-NEXT: v_accvgpr_write_b32 a14, v26
-; GCN-NEXT: v_accvgpr_write_b32 a15, v27
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15] cbsz:1 blgp:4
-; GCN-NEXT: s_nop 15
-; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
-; GCN-NEXT: s_setpc_b64 s[30:31]
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15] blgp:4
+; SDAG-NEXT: s_nop 15
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v12
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v13
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a15, v27
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15] blgp:4
+; GISEL-NEXT: s_nop 15
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
- i32 1, ; cbsz
+ i32 0, ; cbsz
i32 4, ; blgp
i32 0, i32 0, i32 0, i32 0)
ret <16 x float> %result
}
-; fp6 x fp8
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: scratch_load_dword v31, off, s32
-; GCN-NEXT: v_accvgpr_write_b32 a0, v14
-; GCN-NEXT: v_accvgpr_write_b32 a1, v15
-; GCN-NEXT: v_accvgpr_write_b32 a2, v16
-; GCN-NEXT: v_accvgpr_write_b32 a3, v17
-; GCN-NEXT: v_accvgpr_write_b32 a4, v18
-; GCN-NEXT: v_accvgpr_write_b32 a5, v19
-; GCN-NEXT: v_accvgpr_write_b32 a6, v20
-; GCN-NEXT: v_accvgpr_write_b32 a7, v21
-; GCN-NEXT: v_accvgpr_write_b32 a8, v22
-; GCN-NEXT: v_accvgpr_write_b32 a9, v23
-; GCN-NEXT: v_accvgpr_write_b32 a10, v24
-; GCN-NEXT: v_accvgpr_write_b32 a11, v25
-; GCN-NEXT: v_accvgpr_write_b32 a12, v26
-; GCN-NEXT: v_accvgpr_write_b32 a13, v27
-; GCN-NEXT: v_accvgpr_write_b32 a14, v28
-; GCN-NEXT: v_accvgpr_write_b32 a15, v29
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:2
-; GCN-NEXT: s_nop 15
-; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
-; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
- i32 2, ; cbsz
+; bf8 x fp8
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: scratch_load_dword a15, off, s32
+; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword v17, off, s32 offset:4
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[0,0,0] cbsz:1
+; SDAG-NEXT: s_nop 15
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4
+; GISEL-NEXT: scratch_load_dword v17, off, s32 offset:8
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] cbsz:1
+; GISEL-NEXT: s_nop 15
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 1, ; cbsz
i32 0, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v14
-; GCN-NEXT: v_accvgpr_write_b32 a1, v15
-; GCN-NEXT: v_accvgpr_write_b32 a2, v16
-; GCN-NEXT: v_accvgpr_write_b32 a3, v17
-; GCN-NEXT: v_accvgpr_write_b32 a4, v18
-; GCN-NEXT: v_accvgpr_write_b32 a5, v19
-; GCN-NEXT: v_accvgpr_write_b32 a6, v20
-; GCN-NEXT: v_accvgpr_write_b32 a7, v21
-; GCN-NEXT: v_accvgpr_write_b32 a8, v22
-; GCN-NEXT: v_accvgpr_write_b32 a9, v23
-; GCN-NEXT: v_accvgpr_write_b32 a10, v24
-; GCN-NEXT: v_accvgpr_write_b32 a11, v25
-; GCN-NEXT: v_accvgpr_write_b32 a12, v26
-; GCN-NEXT: v_accvgpr_write_b32 a13, v27
-; GCN-NEXT: v_accvgpr_write_b32 a14, v28
-; GCN-NEXT: v_accvgpr_write_b32 a15, v29
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:2
-; GCN-NEXT: s_nop 15
-; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
-; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
- i32 2, ; cbsz
- i32 0, ; blgp
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: scratch_load_dword a15, off, s32
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:1
+; SDAG-NEXT: s_nop 15
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:1
+; GISEL-NEXT: s_nop 15
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 1, ; cbsz
+ i32 0, ; blgp
i32 0, i32 0, i32 0, i32 0)
ret <16 x float> %result
}
-; fp6 x bf8
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: scratch_load_dword v31, off, s32
-; GCN-NEXT: v_accvgpr_write_b32 a0, v14
-; GCN-NEXT: v_accvgpr_write_b32 a1, v15
-; GCN-NEXT: v_accvgpr_write_b32 a2, v16
-; GCN-NEXT: v_accvgpr_write_b32 a3, v17
-; GCN-NEXT: v_accvgpr_write_b32 a4, v18
-; GCN-NEXT: v_accvgpr_write_b32 a5, v19
-; GCN-NEXT: v_accvgpr_write_b32 a6, v20
-; GCN-NEXT: v_accvgpr_write_b32 a7, v21
-; GCN-NEXT: v_accvgpr_write_b32 a8, v22
-; GCN-NEXT: v_accvgpr_write_b32 a9, v23
-; GCN-NEXT: v_accvgpr_write_b32 a10, v24
-; GCN-NEXT: v_accvgpr_write_b32 a11, v25
-; GCN-NEXT: v_accvgpr_write_b32 a12, v26
-; GCN-NEXT: v_accvgpr_write_b32 a13, v27
-; GCN-NEXT: v_accvgpr_write_b32 a14, v28
-; GCN-NEXT: v_accvgpr_write_b32 a15, v29
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:2 blgp:1
-; GCN-NEXT: s_nop 15
-; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
-; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
- i32 2, ; cbsz
+; bf8 x bf8
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: scratch_load_dword a15, off, s32
+; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword v17, off, s32 offset:4
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[0,0,0] cbsz:1 blgp:1
+; SDAG-NEXT: s_nop 15
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4
+; GISEL-NEXT: scratch_load_dword v17, off, s32 offset:8
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] cbsz:1 blgp:1
+; GISEL-NEXT: s_nop 15
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 1, ; cbsz
i32 1, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v14
-; GCN-NEXT: v_accvgpr_write_b32 a1, v15
-; GCN-NEXT: v_accvgpr_write_b32 a2, v16
-; GCN-NEXT: v_accvgpr_write_b32 a3, v17
-; GCN-NEXT: v_accvgpr_write_b32 a4, v18
-; GCN-NEXT: v_accvgpr_write_b32 a5, v19
-; GCN-NEXT: v_accvgpr_write_b32 a6, v20
-; GCN-NEXT: v_accvgpr_write_b32 a7, v21
-; GCN-NEXT: v_accvgpr_write_b32 a8, v22
-; GCN-NEXT: v_accvgpr_write_b32 a9, v23
-; GCN-NEXT: v_accvgpr_write_b32 a10, v24
-; GCN-NEXT: v_accvgpr_write_b32 a11, v25
-; GCN-NEXT: v_accvgpr_write_b32 a12, v26
-; GCN-NEXT: v_accvgpr_write_b32 a13, v27
-; GCN-NEXT: v_accvgpr_write_b32 a14, v28
-; GCN-NEXT: v_accvgpr_write_b32 a15, v29
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:2 blgp:1
-; GCN-NEXT: s_nop 15
-; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
-; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
- i32 2, ; cbsz
- i32 1, ; blgp
- i32 0, i32 0, i32 0, i32 0)
- ret <16 x float> %result
-}
-
-; fp6 x fp6
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v12
-; GCN-NEXT: v_accvgpr_write_b32 a1, v13
-; GCN-NEXT: v_accvgpr_write_b32 a2, v14
-; GCN-NEXT: v_accvgpr_write_b32 a3, v15
-; GCN-NEXT: v_accvgpr_write_b32 a4, v16
-; GCN-NEXT: v_accvgpr_write_b32 a5, v17
-; GCN-NEXT: v_accvgpr_write_b32 a6, v18
-; GCN-NEXT: v_accvgpr_write_b32 a7, v19
-; GCN-NEXT: v_accvgpr_write_b32 a8, v20
-; GCN-NEXT: v_accvgpr_write_b32 a9, v21
-; GCN-NEXT: v_accvgpr_write_b32 a10, v22
-; GCN-NEXT: v_accvgpr_write_b32 a11, v23
-; GCN-NEXT: v_accvgpr_write_b32 a12, v24
-; GCN-NEXT: v_accvgpr_write_b32 a13, v25
-; GCN-NEXT: v_accvgpr_write_b32 a14, v26
-; GCN-NEXT: v_accvgpr_write_b32 a15, v27
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:2 blgp:2
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
-; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
- i32 2, ; cbsz
- i32 2, ; blgp
- i32 0, i32 %scale0, i32 0, i32 %scale1)
- ret <16 x float> %result
-}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v12
-; GCN-NEXT: v_accvgpr_write_b32 a1, v13
-; GCN-NEXT: v_accvgpr_write_b32 a2, v14
-; GCN-NEXT: v_accvgpr_write_b32 a3, v15
-; GCN-NEXT: v_accvgpr_write_b32 a4, v16
-; GCN-NEXT: v_accvgpr_write_b32 a5, v17
-; GCN-NEXT: v_accvgpr_write_b32 a6, v18
-; GCN-NEXT: v_accvgpr_write_b32 a7, v19
-; GCN-NEXT: v_accvgpr_write_b32 a8, v20
-; GCN-NEXT: v_accvgpr_write_b32 a9, v21
-; GCN-NEXT: v_accvgpr_write_b32 a10, v22
-; GCN-NEXT: v_accvgpr_write_b32 a11, v23
-; GCN-NEXT: v_accvgpr_write_b32 a12, v24
-; GCN-NEXT: v_accvgpr_write_b32 a13, v25
-; GCN-NEXT: v_accvgpr_write_b32 a14, v26
-; GCN-NEXT: v_accvgpr_write_b32 a15, v27
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:2 blgp:2
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
-; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
- i32 2, ; cbsz
- i32 2, ; blgp
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: scratch_load_dword a15, off, s32
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:1 blgp:1
+; SDAG-NEXT: s_nop 15
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:1 blgp:1
+; GISEL-NEXT: s_nop 15
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 1, ; cbsz
+ i32 1, ; blgp
i32 0, i32 0, i32 0, i32 0)
ret <16 x float> %result
}
-; fp6 x bf6
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v12
-; GCN-NEXT: v_accvgpr_write_b32 a1, v13
-; GCN-NEXT: v_accvgpr_write_b32 a2, v14
-; GCN-NEXT: v_accvgpr_write_b32 a3, v15
-; GCN-NEXT: v_accvgpr_write_b32 a4, v16
-; GCN-NEXT: v_accvgpr_write_b32 a5, v17
-; GCN-NEXT: v_accvgpr_write_b32 a6, v18
-; GCN-NEXT: v_accvgpr_write_b32 a7, v19
-; GCN-NEXT: v_accvgpr_write_b32 a8, v20
-; GCN-NEXT: v_accvgpr_write_b32 a9, v21
-; GCN-NEXT: v_accvgpr_write_b32 a10, v22
-; GCN-NEXT: v_accvgpr_write_b32 a11, v23
-; GCN-NEXT: v_accvgpr_write_b32 a12, v24
-; GCN-NEXT: v_accvgpr_write_b32 a13, v25
-; GCN-NEXT: v_accvgpr_write_b32 a14, v26
-; GCN-NEXT: v_accvgpr_write_b32 a15, v27
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:2 blgp:3
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
-; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
- i32 2, ; cbsz
- i32 3, ; blgp
+; bf8 x fp6
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v14
+; SDAG-NEXT: scratch_load_dword v14, off, s32
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v15
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:1 blgp:2
+; SDAG-NEXT: s_nop 15
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v14
+; GISEL-NEXT: scratch_load_dword v14, off, s32
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a15, v29
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:1 blgp:2
+; GISEL-NEXT: s_nop 15
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
+ i32 1, ; cbsz
+ i32 2, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v12
-; GCN-NEXT: v_accvgpr_write_b32 a1, v13
-; GCN-NEXT: v_accvgpr_write_b32 a2, v14
-; GCN-NEXT: v_accvgpr_write_b32 a3, v15
-; GCN-NEXT: v_accvgpr_write_b32 a4, v16
-; GCN-NEXT: v_accvgpr_write_b32 a5, v17
-; GCN-NEXT: v_accvgpr_write_b32 a6, v18
-; GCN-NEXT: v_accvgpr_write_b32 a7, v19
-; GCN-NEXT: v_accvgpr_write_b32 a8, v20
-; GCN-NEXT: v_accvgpr_write_b32 a9, v21
-; GCN-NEXT: v_accvgpr_write_b32 a10, v22
-; GCN-NEXT: v_accvgpr_write_b32 a11, v23
-; GCN-NEXT: v_accvgpr_write_b32 a12, v24
-; GCN-NEXT: v_accvgpr_write_b32 a13, v25
-; GCN-NEXT: v_accvgpr_write_b32 a14, v26
-; GCN-NEXT: v_accvgpr_write_b32 a15, v27
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:2 blgp:3
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
-; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
- i32 2, ; cbsz
- i32 3, ; blgp
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v14
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] cbsz:1 blgp:2
+; SDAG-NEXT: s_nop 15
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a15, v29
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] cbsz:1 blgp:2
+; GISEL-NEXT: s_nop 15
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
+ i32 1, ; cbsz
+ i32 2, ; blgp
i32 0, i32 0, i32 0, i32 0)
ret <16 x float> %result
}
-
-; bf6 x fp8
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: scratch_load_dword v31, off, s32
-; GCN-NEXT: v_accvgpr_write_b32 a0, v14
-; GCN-NEXT: v_accvgpr_write_b32 a1, v15
-; GCN-NEXT: v_accvgpr_write_b32 a2, v16
-; GCN-NEXT: v_accvgpr_write_b32 a3, v17
-; GCN-NEXT: v_accvgpr_write_b32 a4, v18
-; GCN-NEXT: v_accvgpr_write_b32 a5, v19
-; GCN-NEXT: v_accvgpr_write_b32 a6, v20
-; GCN-NEXT: v_accvgpr_write_b32 a7, v21
-; GCN-NEXT: v_accvgpr_write_b32 a8, v22
-; GCN-NEXT: v_accvgpr_write_b32 a9, v23
-; GCN-NEXT: v_accvgpr_write_b32 a10, v24
-; GCN-NEXT: v_accvgpr_write_b32 a11, v25
-; GCN-NEXT: v_accvgpr_write_b32 a12, v26
-; GCN-NEXT: v_accvgpr_write_b32 a13, v27
-; GCN-NEXT: v_accvgpr_write_b32 a14, v28
-; GCN-NEXT: v_accvgpr_write_b32 a15, v29
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:3
-; GCN-NEXT: s_nop 15
-; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
-; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
- i32 3, ; cbsz
- i32 0, ; blgp
+; bf8 x bf6
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v14
+; SDAG-NEXT: scratch_load_dword v14, off, s32
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v15
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:1 blgp:3
+; SDAG-NEXT: s_nop 15
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v14
+; GISEL-NEXT: scratch_load_dword v14, off, s32
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a15, v29
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:1 blgp:3
+; GISEL-NEXT: s_nop 15
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
+ i32 1, ; cbsz
+ i32 3, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v14
-; GCN-NEXT: v_accvgpr_write_b32 a1, v15
-; GCN-NEXT: v_accvgpr_write_b32 a2, v16
-; GCN-NEXT: v_accvgpr_write_b32 a3, v17
-; GCN-NEXT: v_accvgpr_write_b32 a4, v18
-; GCN-NEXT: v_accvgpr_write_b32 a5, v19
-; GCN-NEXT: v_accvgpr_write_b32 a6, v20
-; GCN-NEXT: v_accvgpr_write_b32 a7, v21
-; GCN-NEXT: v_accvgpr_write_b32 a8, v22
-; GCN-NEXT: v_accvgpr_write_b32 a9, v23
-; GCN-NEXT: v_accvgpr_write_b32 a10, v24
-; GCN-NEXT: v_accvgpr_write_b32 a11, v25
-; GCN-NEXT: v_accvgpr_write_b32 a12, v26
-; GCN-NEXT: v_accvgpr_write_b32 a13, v27
-; GCN-NEXT: v_accvgpr_write_b32 a14, v28
-; GCN-NEXT: v_accvgpr_write_b32 a15, v29
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:3
-; GCN-NEXT: s_nop 15
-; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
-; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
- i32 3, ; cbsz
- i32 0, ; blgp
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v14
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] cbsz:1 blgp:3
+; SDAG-NEXT: s_nop 15
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a15, v29
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] cbsz:1 blgp:3
+; GISEL-NEXT: s_nop 15
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
+ i32 1, ; cbsz
+ i32 3, ; blgp
i32 0, i32 0, i32 0, i32 0)
ret <16 x float> %result
}
-; bf6 x bf8
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: scratch_load_dword v31, off, s32
-; GCN-NEXT: v_accvgpr_write_b32 a0, v14
-; GCN-NEXT: v_accvgpr_write_b32 a1, v15
-; GCN-NEXT: v_accvgpr_write_b32 a2, v16
-; GCN-NEXT: v_accvgpr_write_b32 a3, v17
-; GCN-NEXT: v_accvgpr_write_b32 a4, v18
-; GCN-NEXT: v_accvgpr_write_b32 a5, v19
-; GCN-NEXT: v_accvgpr_write_b32 a6, v20
-; GCN-NEXT: v_accvgpr_write_b32 a7, v21
-; GCN-NEXT: v_accvgpr_write_b32 a8, v22
-; GCN-NEXT: v_accvgpr_write_b32 a9, v23
-; GCN-NEXT: v_accvgpr_write_b32 a10, v24
-; GCN-NEXT: v_accvgpr_write_b32 a11, v25
-; GCN-NEXT: v_accvgpr_write_b32 a12, v26
-; GCN-NEXT: v_accvgpr_write_b32 a13, v27
-; GCN-NEXT: v_accvgpr_write_b32 a14, v28
-; GCN-NEXT: v_accvgpr_write_b32 a15, v29
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:3 blgp:1
-; GCN-NEXT: s_nop 15
-; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
-; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
- i32 3, ; cbsz
- i32 1, ; blgp
+; bf8 x fp4
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:1 blgp:4
+; SDAG-NEXT: s_nop 15
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v12
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v13
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a15, v27
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:1 blgp:4
+; GISEL-NEXT: s_nop 15
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
+ i32 1, ; cbsz
+ i32 4, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v14
-; GCN-NEXT: v_accvgpr_write_b32 a1, v15
-; GCN-NEXT: v_accvgpr_write_b32 a2, v16
-; GCN-NEXT: v_accvgpr_write_b32 a3, v17
-; GCN-NEXT: v_accvgpr_write_b32 a4, v18
-; GCN-NEXT: v_accvgpr_write_b32 a5, v19
-; GCN-NEXT: v_accvgpr_write_b32 a6, v20
-; GCN-NEXT: v_accvgpr_write_b32 a7, v21
-; GCN-NEXT: v_accvgpr_write_b32 a8, v22
-; GCN-NEXT: v_accvgpr_write_b32 a9, v23
-; GCN-NEXT: v_accvgpr_write_b32 a10, v24
-; GCN-NEXT: v_accvgpr_write_b32 a11, v25
-; GCN-NEXT: v_accvgpr_write_b32 a12, v26
-; GCN-NEXT: v_accvgpr_write_b32 a13, v27
-; GCN-NEXT: v_accvgpr_write_b32 a14, v28
-; GCN-NEXT: v_accvgpr_write_b32 a15, v29
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:3 blgp:1
-; GCN-NEXT: s_nop 15
-; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
-; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
- i32 3, ; cbsz
- i32 1, ; blgp
- i32 0, i32 0, i32 0, i32 0)
- ret <16 x float> %result
-}
-
-; bf6 x fp6
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v12
-; GCN-NEXT: v_accvgpr_write_b32 a1, v13
-; GCN-NEXT: v_accvgpr_write_b32 a2, v14
-; GCN-NEXT: v_accvgpr_write_b32 a3, v15
-; GCN-NEXT: v_accvgpr_write_b32 a4, v16
-; GCN-NEXT: v_accvgpr_write_b32 a5, v17
-; GCN-NEXT: v_accvgpr_write_b32 a6, v18
-; GCN-NEXT: v_accvgpr_write_b32 a7, v19
-; GCN-NEXT: v_accvgpr_write_b32 a8, v20
-; GCN-NEXT: v_accvgpr_write_b32 a9, v21
-; GCN-NEXT: v_accvgpr_write_b32 a10, v22
-; GCN-NEXT: v_accvgpr_write_b32 a11, v23
-; GCN-NEXT: v_accvgpr_write_b32 a12, v24
-; GCN-NEXT: v_accvgpr_write_b32 a13, v25
-; GCN-NEXT: v_accvgpr_write_b32 a14, v26
-; GCN-NEXT: v_accvgpr_write_b32 a15, v27
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:3 blgp:2
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
-; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
- i32 3, ; cbsz
- i32 2, ; blgp
- i32 0, i32 %scale0, i32 0, i32 %scale1)
- ret <16 x float> %result
-}
-
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v12
-; GCN-NEXT: v_accvgpr_write_b32 a1, v13
-; GCN-NEXT: v_accvgpr_write_b32 a2, v14
-; GCN-NEXT: v_accvgpr_write_b32 a3, v15
-; GCN-NEXT: v_accvgpr_write_b32 a4, v16
-; GCN-NEXT: v_accvgpr_write_b32 a5, v17
-; GCN-NEXT: v_accvgpr_write_b32 a6, v18
-; GCN-NEXT: v_accvgpr_write_b32 a7, v19
-; GCN-NEXT: v_accvgpr_write_b32 a8, v20
-; GCN-NEXT: v_accvgpr_write_b32 a9, v21
-; GCN-NEXT: v_accvgpr_write_b32 a10, v22
-; GCN-NEXT: v_accvgpr_write_b32 a11, v23
-; GCN-NEXT: v_accvgpr_write_b32 a12, v24
-; GCN-NEXT: v_accvgpr_write_b32 a13, v25
-; GCN-NEXT: v_accvgpr_write_b32 a14, v26
-; GCN-NEXT: v_accvgpr_write_b32 a15, v27
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:3 blgp:2
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
-; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
- i32 3, ; cbsz
- i32 2, ; blgp
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15] cbsz:1 blgp:4
+; SDAG-NEXT: s_nop 15
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v12
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v13
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a15, v27
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15] cbsz:1 blgp:4
+; GISEL-NEXT: s_nop 15
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
+ i32 1, ; cbsz
+ i32 4, ; blgp
i32 0, i32 0, i32 0, i32 0)
ret <16 x float> %result
}
-; bf6 x fp4
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v10
-; GCN-NEXT: v_accvgpr_write_b32 a1, v11
-; GCN-NEXT: v_accvgpr_write_b32 a2, v12
-; GCN-NEXT: v_accvgpr_write_b32 a3, v13
-; GCN-NEXT: v_accvgpr_write_b32 a4, v14
-; GCN-NEXT: v_accvgpr_write_b32 a5, v15
-; GCN-NEXT: v_accvgpr_write_b32 a6, v16
-; GCN-NEXT: v_accvgpr_write_b32 a7, v17
-; GCN-NEXT: v_accvgpr_write_b32 a8, v18
-; GCN-NEXT: v_accvgpr_write_b32 a9, v19
-; GCN-NEXT: v_accvgpr_write_b32 a10, v20
-; GCN-NEXT: v_accvgpr_write_b32 a11, v21
-; GCN-NEXT: v_accvgpr_write_b32 a12, v22
-; GCN-NEXT: v_accvgpr_write_b32 a13, v23
-; GCN-NEXT: v_accvgpr_write_b32 a14, v24
-; GCN-NEXT: v_accvgpr_write_b32 a15, v25
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:3 blgp:4
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
-; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
- i32 3, ; cbsz
- i32 4, ; blgp
+; fp6 x fp8
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v14
+; SDAG-NEXT: scratch_load_dword v14, off, s32
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v15
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:2
+; SDAG-NEXT: s_nop 15
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v14
+; GISEL-NEXT: scratch_load_dword v14, off, s32
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a15, v29
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:2
+; GISEL-NEXT: s_nop 15
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 2, ; cbsz
+ i32 0, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v10
-; GCN-NEXT: v_accvgpr_write_b32 a1, v11
-; GCN-NEXT: v_accvgpr_write_b32 a2, v12
-; GCN-NEXT: v_accvgpr_write_b32 a3, v13
-; GCN-NEXT: v_accvgpr_write_b32 a4, v14
-; GCN-NEXT: v_accvgpr_write_b32 a5, v15
-; GCN-NEXT: v_accvgpr_write_b32 a6, v16
-; GCN-NEXT: v_accvgpr_write_b32 a7, v17
-; GCN-NEXT: v_accvgpr_write_b32 a8, v18
-; GCN-NEXT: v_accvgpr_write_b32 a9, v19
-; GCN-NEXT: v_accvgpr_write_b32 a10, v20
-; GCN-NEXT: v_accvgpr_write_b32 a11, v21
-; GCN-NEXT: v_accvgpr_write_b32 a12, v22
-; GCN-NEXT: v_accvgpr_write_b32 a13, v23
-; GCN-NEXT: v_accvgpr_write_b32 a14, v24
-; GCN-NEXT: v_accvgpr_write_b32 a15, v25
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15] cbsz:3 blgp:4
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
-; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
- i32 3, ; cbsz
- i32 4, ; blgp
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v14
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:2
+; SDAG-NEXT: s_nop 15
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a15, v29
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:2
+; GISEL-NEXT: s_nop 15
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 2, ; cbsz
+ i32 0, ; blgp
i32 0, i32 0, i32 0, i32 0)
ret <16 x float> %result
}
-; bf6 x bf6
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v12
-; GCN-NEXT: v_accvgpr_write_b32 a1, v13
-; GCN-NEXT: v_accvgpr_write_b32 a2, v14
-; GCN-NEXT: v_accvgpr_write_b32 a3, v15
-; GCN-NEXT: v_accvgpr_write_b32 a4, v16
-; GCN-NEXT: v_accvgpr_write_b32 a5, v17
-; GCN-NEXT: v_accvgpr_write_b32 a6, v18
-; GCN-NEXT: v_accvgpr_write_b32 a7, v19
-; GCN-NEXT: v_accvgpr_write_b32 a8, v20
-; GCN-NEXT: v_accvgpr_write_b32 a9, v21
-; GCN-NEXT: v_accvgpr_write_b32 a10, v22
-; GCN-NEXT: v_accvgpr_write_b32 a11, v23
-; GCN-NEXT: v_accvgpr_write_b32 a12, v24
-; GCN-NEXT: v_accvgpr_write_b32 a13, v25
-; GCN-NEXT: v_accvgpr_write_b32 a14, v26
-; GCN-NEXT: v_accvgpr_write_b32 a15, v27
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:3 blgp:3
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
-; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
- i32 3, ; cbsz
- i32 3, ; blgp
+; fp6 x bf8
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v14
+; SDAG-NEXT: scratch_load_dword v14, off, s32
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v15
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:2 blgp:1
+; SDAG-NEXT: s_nop 15
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v14
+; GISEL-NEXT: scratch_load_dword v14, off, s32
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a15, v29
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:2 blgp:1
+; GISEL-NEXT: s_nop 15
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 2, ; cbsz
+ i32 1, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v12
-; GCN-NEXT: v_accvgpr_write_b32 a1, v13
-; GCN-NEXT: v_accvgpr_write_b32 a2, v14
-; GCN-NEXT: v_accvgpr_write_b32 a3, v15
-; GCN-NEXT: v_accvgpr_write_b32 a4, v16
-; GCN-NEXT: v_accvgpr_write_b32 a5, v17
-; GCN-NEXT: v_accvgpr_write_b32 a6, v18
-; GCN-NEXT: v_accvgpr_write_b32 a7, v19
-; GCN-NEXT: v_accvgpr_write_b32 a8, v20
-; GCN-NEXT: v_accvgpr_write_b32 a9, v21
-; GCN-NEXT: v_accvgpr_write_b32 a10, v22
-; GCN-NEXT: v_accvgpr_write_b32 a11, v23
-; GCN-NEXT: v_accvgpr_write_b32 a12, v24
-; GCN-NEXT: v_accvgpr_write_b32 a13, v25
-; GCN-NEXT: v_accvgpr_write_b32 a14, v26
-; GCN-NEXT: v_accvgpr_write_b32 a15, v27
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:3 blgp:3
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
-; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
- i32 3, ; cbsz
- i32 3, ; blgp
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v14
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:2 blgp:1
+; SDAG-NEXT: s_nop 15
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a15, v29
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:2 blgp:1
+; GISEL-NEXT: s_nop 15
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 2, ; cbsz
+ i32 1, ; blgp
i32 0, i32 0, i32 0, i32 0)
ret <16 x float> %result
}
-; fp6 x fp4
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v10
-; GCN-NEXT: v_accvgpr_write_b32 a1, v11
-; GCN-NEXT: v_accvgpr_write_b32 a2, v12
-; GCN-NEXT: v_accvgpr_write_b32 a3, v13
-; GCN-NEXT: v_accvgpr_write_b32 a4, v14
-; GCN-NEXT: v_accvgpr_write_b32 a5, v15
-; GCN-NEXT: v_accvgpr_write_b32 a6, v16
-; GCN-NEXT: v_accvgpr_write_b32 a7, v17
-; GCN-NEXT: v_accvgpr_write_b32 a8, v18
-; GCN-NEXT: v_accvgpr_write_b32 a9, v19
-; GCN-NEXT: v_accvgpr_write_b32 a10, v20
-; GCN-NEXT: v_accvgpr_write_b32 a11, v21
-; GCN-NEXT: v_accvgpr_write_b32 a12, v22
-; GCN-NEXT: v_accvgpr_write_b32 a13, v23
-; GCN-NEXT: v_accvgpr_write_b32 a14, v24
-; GCN-NEXT: v_accvgpr_write_b32 a15, v25
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:2 blgp:4
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
-; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
+; fp6 x fp6
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:2 blgp:2
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v12
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v13
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a15, v27
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:2 blgp:2
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
i32 2, ; cbsz
- i32 4, ; blgp
+ i32 2, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v10
-; GCN-NEXT: v_accvgpr_write_b32 a1, v11
-; GCN-NEXT: v_accvgpr_write_b32 a2, v12
-; GCN-NEXT: v_accvgpr_write_b32 a3, v13
-; GCN-NEXT: v_accvgpr_write_b32 a4, v14
-; GCN-NEXT: v_accvgpr_write_b32 a5, v15
-; GCN-NEXT: v_accvgpr_write_b32 a6, v16
-; GCN-NEXT: v_accvgpr_write_b32 a7, v17
-; GCN-NEXT: v_accvgpr_write_b32 a8, v18
-; GCN-NEXT: v_accvgpr_write_b32 a9, v19
-; GCN-NEXT: v_accvgpr_write_b32 a10, v20
-; GCN-NEXT: v_accvgpr_write_b32 a11, v21
-; GCN-NEXT: v_accvgpr_write_b32 a12, v22
-; GCN-NEXT: v_accvgpr_write_b32 a13, v23
-; GCN-NEXT: v_accvgpr_write_b32 a14, v24
-; GCN-NEXT: v_accvgpr_write_b32 a15, v25
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15] cbsz:2 blgp:4
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
-; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
- i32 2, ; cbsz
- i32 4, ; blgp
- i32 0, i32 0, i32 0, i32 0)
- ret <16 x float> %result
-}
-
-; fp4 x fp8
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v12
-; GCN-NEXT: v_accvgpr_write_b32 a1, v13
-; GCN-NEXT: v_accvgpr_write_b32 a2, v14
-; GCN-NEXT: v_accvgpr_write_b32 a3, v15
-; GCN-NEXT: v_accvgpr_write_b32 a4, v16
-; GCN-NEXT: v_accvgpr_write_b32 a5, v17
-; GCN-NEXT: v_accvgpr_write_b32 a6, v18
-; GCN-NEXT: v_accvgpr_write_b32 a7, v19
-; GCN-NEXT: v_accvgpr_write_b32 a8, v20
-; GCN-NEXT: v_accvgpr_write_b32 a9, v21
-; GCN-NEXT: v_accvgpr_write_b32 a10, v22
-; GCN-NEXT: v_accvgpr_write_b32 a11, v23
-; GCN-NEXT: v_accvgpr_write_b32 a12, v24
-; GCN-NEXT: v_accvgpr_write_b32 a13, v25
-; GCN-NEXT: v_accvgpr_write_b32 a14, v26
-; GCN-NEXT: v_accvgpr_write_b32 a15, v27
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:4
-; GCN-NEXT: s_nop 15
-; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
-; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
- i32 4, ; cbsz
- i32 0, ; blgp
- i32 0, i32 %scale0, i32 0, i32 %scale1)
- ret <16 x float> %result
-}
-
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v12
-; GCN-NEXT: v_accvgpr_write_b32 a1, v13
-; GCN-NEXT: v_accvgpr_write_b32 a2, v14
-; GCN-NEXT: v_accvgpr_write_b32 a3, v15
-; GCN-NEXT: v_accvgpr_write_b32 a4, v16
-; GCN-NEXT: v_accvgpr_write_b32 a5, v17
-; GCN-NEXT: v_accvgpr_write_b32 a6, v18
-; GCN-NEXT: v_accvgpr_write_b32 a7, v19
-; GCN-NEXT: v_accvgpr_write_b32 a8, v20
-; GCN-NEXT: v_accvgpr_write_b32 a9, v21
-; GCN-NEXT: v_accvgpr_write_b32 a10, v22
-; GCN-NEXT: v_accvgpr_write_b32 a11, v23
-; GCN-NEXT: v_accvgpr_write_b32 a12, v24
-; GCN-NEXT: v_accvgpr_write_b32 a13, v25
-; GCN-NEXT: v_accvgpr_write_b32 a14, v26
-; GCN-NEXT: v_accvgpr_write_b32 a15, v27
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15] cbsz:4
-; GCN-NEXT: s_nop 15
-; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
-; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
- i32 4, ; cbsz
- i32 0, ; blgp
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:2 blgp:2
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v12
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v13
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a15, v27
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:2 blgp:2
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
+ i32 2, ; cbsz
+ i32 2, ; blgp
i32 0, i32 0, i32 0, i32 0)
ret <16 x float> %result
}
-; fp4 x bf8
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v12
-; GCN-NEXT: v_accvgpr_write_b32 a1, v13
-; GCN-NEXT: v_accvgpr_write_b32 a2, v14
-; GCN-NEXT: v_accvgpr_write_b32 a3, v15
-; GCN-NEXT: v_accvgpr_write_b32 a4, v16
-; GCN-NEXT: v_accvgpr_write_b32 a5, v17
-; GCN-NEXT: v_accvgpr_write_b32 a6, v18
-; GCN-NEXT: v_accvgpr_write_b32 a7, v19
-; GCN-NEXT: v_accvgpr_write_b32 a8, v20
-; GCN-NEXT: v_accvgpr_write_b32 a9, v21
-; GCN-NEXT: v_accvgpr_write_b32 a10, v22
-; GCN-NEXT: v_accvgpr_write_b32 a11, v23
-; GCN-NEXT: v_accvgpr_write_b32 a12, v24
-; GCN-NEXT: v_accvgpr_write_b32 a13, v25
-; GCN-NEXT: v_accvgpr_write_b32 a14, v26
-; GCN-NEXT: v_accvgpr_write_b32 a15, v27
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:4 blgp:1
-; GCN-NEXT: s_nop 15
-; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
-; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
- i32 4, ; cbsz
- i32 1, ; blgp
+; fp6 x bf6
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:2 blgp:3
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v12
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v13
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a15, v27
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:2 blgp:3
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
+ i32 2, ; cbsz
+ i32 3, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v12
-; GCN-NEXT: v_accvgpr_write_b32 a1, v13
-; GCN-NEXT: v_accvgpr_write_b32 a2, v14
-; GCN-NEXT: v_accvgpr_write_b32 a3, v15
-; GCN-NEXT: v_accvgpr_write_b32 a4, v16
-; GCN-NEXT: v_accvgpr_write_b32 a5, v17
-; GCN-NEXT: v_accvgpr_write_b32 a6, v18
-; GCN-NEXT: v_accvgpr_write_b32 a7, v19
-; GCN-NEXT: v_accvgpr_write_b32 a8, v20
-; GCN-NEXT: v_accvgpr_write_b32 a9, v21
-; GCN-NEXT: v_accvgpr_write_b32 a10, v22
-; GCN-NEXT: v_accvgpr_write_b32 a11, v23
-; GCN-NEXT: v_accvgpr_write_b32 a12, v24
-; GCN-NEXT: v_accvgpr_write_b32 a13, v25
-; GCN-NEXT: v_accvgpr_write_b32 a14, v26
-; GCN-NEXT: v_accvgpr_write_b32 a15, v27
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15] cbsz:4 blgp:1
-; GCN-NEXT: s_nop 15
-; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
-; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
- i32 4, ; cbsz
- i32 1, ; blgp
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:2 blgp:3
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v12
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v13
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a15, v27
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:2 blgp:3
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
+ i32 2, ; cbsz
+ i32 3, ; blgp
i32 0, i32 0, i32 0, i32 0)
ret <16 x float> %result
}
-; fp4 x fp6
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v10
-; GCN-NEXT: v_accvgpr_write_b32 a1, v11
-; GCN-NEXT: v_accvgpr_write_b32 a2, v12
-; GCN-NEXT: v_accvgpr_write_b32 a3, v13
-; GCN-NEXT: v_accvgpr_write_b32 a4, v14
-; GCN-NEXT: v_accvgpr_write_b32 a5, v15
-; GCN-NEXT: v_accvgpr_write_b32 a6, v16
-; GCN-NEXT: v_accvgpr_write_b32 a7, v17
-; GCN-NEXT: v_accvgpr_write_b32 a8, v18
-; GCN-NEXT: v_accvgpr_write_b32 a9, v19
-; GCN-NEXT: v_accvgpr_write_b32 a10, v20
-; GCN-NEXT: v_accvgpr_write_b32 a11, v21
-; GCN-NEXT: v_accvgpr_write_b32 a12, v22
-; GCN-NEXT: v_accvgpr_write_b32 a13, v23
-; GCN-NEXT: v_accvgpr_write_b32 a14, v24
-; GCN-NEXT: v_accvgpr_write_b32 a15, v25
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:4 blgp:2
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
-; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
- i32 4, ; cbsz
- i32 2, ; blgp
+
+; bf6 x fp8
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v14
+; SDAG-NEXT: scratch_load_dword v14, off, s32
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v15
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:3
+; SDAG-NEXT: s_nop 15
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v14
+; GISEL-NEXT: scratch_load_dword v14, off, s32
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a15, v29
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:3
+; GISEL-NEXT: s_nop 15
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 3, ; cbsz
+ i32 0, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v10
-; GCN-NEXT: v_accvgpr_write_b32 a1, v11
-; GCN-NEXT: v_accvgpr_write_b32 a2, v12
-; GCN-NEXT: v_accvgpr_write_b32 a3, v13
-; GCN-NEXT: v_accvgpr_write_b32 a4, v14
-; GCN-NEXT: v_accvgpr_write_b32 a5, v15
-; GCN-NEXT: v_accvgpr_write_b32 a6, v16
-; GCN-NEXT: v_accvgpr_write_b32 a7, v17
-; GCN-NEXT: v_accvgpr_write_b32 a8, v18
-; GCN-NEXT: v_accvgpr_write_b32 a9, v19
-; GCN-NEXT: v_accvgpr_write_b32 a10, v20
-; GCN-NEXT: v_accvgpr_write_b32 a11, v21
-; GCN-NEXT: v_accvgpr_write_b32 a12, v22
-; GCN-NEXT: v_accvgpr_write_b32 a13, v23
-; GCN-NEXT: v_accvgpr_write_b32 a14, v24
-; GCN-NEXT: v_accvgpr_write_b32 a15, v25
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15] cbsz:4 blgp:2
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
-; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
- i32 4, ; cbsz
- i32 2, ; blgp
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v14
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:3
+; SDAG-NEXT: s_nop 15
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a15, v29
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:3
+; GISEL-NEXT: s_nop 15
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 3, ; cbsz
+ i32 0, ; blgp
i32 0, i32 0, i32 0, i32 0)
ret <16 x float> %result
}
-; fp4 x bf6
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v10
-; GCN-NEXT: v_accvgpr_write_b32 a1, v11
-; GCN-NEXT: v_accvgpr_write_b32 a2, v12
-; GCN-NEXT: v_accvgpr_write_b32 a3, v13
-; GCN-NEXT: v_accvgpr_write_b32 a4, v14
-; GCN-NEXT: v_accvgpr_write_b32 a5, v15
-; GCN-NEXT: v_accvgpr_write_b32 a6, v16
-; GCN-NEXT: v_accvgpr_write_b32 a7, v17
-; GCN-NEXT: v_accvgpr_write_b32 a8, v18
-; GCN-NEXT: v_accvgpr_write_b32 a9, v19
-; GCN-NEXT: v_accvgpr_write_b32 a10, v20
-; GCN-NEXT: v_accvgpr_write_b32 a11, v21
-; GCN-NEXT: v_accvgpr_write_b32 a12, v22
-; GCN-NEXT: v_accvgpr_write_b32 a13, v23
-; GCN-NEXT: v_accvgpr_write_b32 a14, v24
-; GCN-NEXT: v_accvgpr_write_b32 a15, v25
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:4 blgp:3
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
-; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
- i32 4, ; cbsz
- i32 3, ; blgp
+; bf6 x bf8
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v14
+; SDAG-NEXT: scratch_load_dword v14, off, s32
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v15
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:3 blgp:1
+; SDAG-NEXT: s_nop 15
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v14
+; GISEL-NEXT: scratch_load_dword v14, off, s32
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a15, v29
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:3 blgp:1
+; GISEL-NEXT: s_nop 15
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 3, ; cbsz
+ i32 1, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v10
-; GCN-NEXT: v_accvgpr_write_b32 a1, v11
-; GCN-NEXT: v_accvgpr_write_b32 a2, v12
-; GCN-NEXT: v_accvgpr_write_b32 a3, v13
-; GCN-NEXT: v_accvgpr_write_b32 a4, v14
-; GCN-NEXT: v_accvgpr_write_b32 a5, v15
-; GCN-NEXT: v_accvgpr_write_b32 a6, v16
-; GCN-NEXT: v_accvgpr_write_b32 a7, v17
-; GCN-NEXT: v_accvgpr_write_b32 a8, v18
-; GCN-NEXT: v_accvgpr_write_b32 a9, v19
-; GCN-NEXT: v_accvgpr_write_b32 a10, v20
-; GCN-NEXT: v_accvgpr_write_b32 a11, v21
-; GCN-NEXT: v_accvgpr_write_b32 a12, v22
-; GCN-NEXT: v_accvgpr_write_b32 a13, v23
-; GCN-NEXT: v_accvgpr_write_b32 a14, v24
-; GCN-NEXT: v_accvgpr_write_b32 a15, v25
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15] cbsz:4 blgp:3
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
-; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
- i32 4, ; cbsz
- i32 3, ; blgp
- i32 0, i32 0, i32 0, i32 0)
- ret <16 x float> %result
-}
-
-; fp4 x fp4
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4(<4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v8
-; GCN-NEXT: v_accvgpr_write_b32 a1, v9
-; GCN-NEXT: v_accvgpr_write_b32 a2, v10
-; GCN-NEXT: v_accvgpr_write_b32 a3, v11
-; GCN-NEXT: v_accvgpr_write_b32 a4, v12
-; GCN-NEXT: v_accvgpr_write_b32 a5, v13
-; GCN-NEXT: v_accvgpr_write_b32 a6, v14
-; GCN-NEXT: v_accvgpr_write_b32 a7, v15
-; GCN-NEXT: v_accvgpr_write_b32 a8, v16
-; GCN-NEXT: v_accvgpr_write_b32 a9, v17
-; GCN-NEXT: v_accvgpr_write_b32 a10, v18
-; GCN-NEXT: v_accvgpr_write_b32 a11, v19
-; GCN-NEXT: v_accvgpr_write_b32 a12, v20
-; GCN-NEXT: v_accvgpr_write_b32 a13, v21
-; GCN-NEXT: v_accvgpr_write_b32 a14, v22
-; GCN-NEXT: v_accvgpr_write_b32 a15, v23
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:7], a[0:15], v24, v25 op_sel_hi:[0,0,0] cbsz:4 blgp:4
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
-; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
- i32 4, ; cbsz
- i32 4, ; blgp
- i32 0, i32 %scale0, i32 0, i32 %scale1)
- ret <16 x float> %result
-}
-
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0(<4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v8
-; GCN-NEXT: v_accvgpr_write_b32 a1, v9
-; GCN-NEXT: v_accvgpr_write_b32 a2, v10
-; GCN-NEXT: v_accvgpr_write_b32 a3, v11
-; GCN-NEXT: v_accvgpr_write_b32 a4, v12
-; GCN-NEXT: v_accvgpr_write_b32 a5, v13
-; GCN-NEXT: v_accvgpr_write_b32 a6, v14
-; GCN-NEXT: v_accvgpr_write_b32 a7, v15
-; GCN-NEXT: v_accvgpr_write_b32 a8, v16
-; GCN-NEXT: v_accvgpr_write_b32 a9, v17
-; GCN-NEXT: v_accvgpr_write_b32 a10, v18
-; GCN-NEXT: v_accvgpr_write_b32 a11, v19
-; GCN-NEXT: v_accvgpr_write_b32 a12, v20
-; GCN-NEXT: v_accvgpr_write_b32 a13, v21
-; GCN-NEXT: v_accvgpr_write_b32 a14, v22
-; GCN-NEXT: v_accvgpr_write_b32 a15, v23
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:4 blgp:4
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
-; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
- i32 4, ; cbsz
- i32 4, ; blgp
- i32 0, i32 0, i32 0, i32 0)
- ret <16 x float> %result
-}
-
-; --------------------------------------------------------------------
-; Different input parameter classes
-; --------------------------------------------------------------------
-
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__sgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 inreg %scale0, i32 inreg %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__sgpr_scaleB:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: scratch_load_dword a15, off, s32
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: v_accvgpr_write_b32 a4, v20
-; GCN-NEXT: v_accvgpr_write_b32 a5, v21
-; GCN-NEXT: v_accvgpr_write_b32 a6, v22
-; GCN-NEXT: v_accvgpr_write_b32 a7, v23
-; GCN-NEXT: v_accvgpr_write_b32 a8, v24
-; GCN-NEXT: v_accvgpr_write_b32 a9, v25
-; GCN-NEXT: v_accvgpr_write_b32 a10, v26
-; GCN-NEXT: v_accvgpr_write_b32 a11, v27
-; GCN-NEXT: v_accvgpr_write_b32 a12, v28
-; GCN-NEXT: v_accvgpr_write_b32 a13, v29
-; GCN-NEXT: v_accvgpr_write_b32 a14, v30
-; GCN-NEXT: v_mov_b32_e32 v16, s0
-; GCN-NEXT: v_mov_b32_e32 v17, s1
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0]
-; GCN-NEXT: s_nop 15
-; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
-; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
- ret <16 x float> %result
-}
-
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__vgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 inreg %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__vgpr_scaleB:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: scratch_load_dword a15, off, s32
-; GCN-NEXT: scratch_load_dword v31, off, s32 offset:4
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: v_accvgpr_write_b32 a4, v20
-; GCN-NEXT: v_accvgpr_write_b32 a5, v21
-; GCN-NEXT: v_accvgpr_write_b32 a6, v22
-; GCN-NEXT: v_accvgpr_write_b32 a7, v23
-; GCN-NEXT: v_accvgpr_write_b32 a8, v24
-; GCN-NEXT: v_accvgpr_write_b32 a9, v25
-; GCN-NEXT: v_accvgpr_write_b32 a10, v26
-; GCN-NEXT: v_accvgpr_write_b32 a11, v27
-; GCN-NEXT: v_accvgpr_write_b32 a12, v28
-; GCN-NEXT: v_accvgpr_write_b32 a13, v29
-; GCN-NEXT: v_accvgpr_write_b32 a14, v30
-; GCN-NEXT: v_mov_b32_e32 v16, s0
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v31 op_sel_hi:[0,0,0]
-; GCN-NEXT: s_nop 15
-; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
-; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
- ret <16 x float> %result
-}
-
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgpr_scaleA__sgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 inreg %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgpr_scaleA__sgpr_scaleB:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: scratch_load_dword a15, off, s32
-; GCN-NEXT: scratch_load_dword v31, off, s32 offset:4
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: v_accvgpr_write_b32 a4, v20
-; GCN-NEXT: v_accvgpr_write_b32 a5, v21
-; GCN-NEXT: v_accvgpr_write_b32 a6, v22
-; GCN-NEXT: v_accvgpr_write_b32 a7, v23
-; GCN-NEXT: v_accvgpr_write_b32 a8, v24
-; GCN-NEXT: v_accvgpr_write_b32 a9, v25
-; GCN-NEXT: v_accvgpr_write_b32 a10, v26
-; GCN-NEXT: v_accvgpr_write_b32 a11, v27
-; GCN-NEXT: v_accvgpr_write_b32 a12, v28
-; GCN-NEXT: v_accvgpr_write_b32 a13, v29
-; GCN-NEXT: v_accvgpr_write_b32 a14, v30
-; GCN-NEXT: v_mov_b32_e32 v16, s0
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v16 op_sel_hi:[0,0,0]
-; GCN-NEXT: s_nop 15
-; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
-; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
- ret <16 x float> %result
-}
-
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x float> inreg %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs:
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v26, s0
-; SDAG-NEXT: v_mov_b32_e32 v27, s1
-; SDAG-NEXT: v_mov_b32_e32 v28, s2
-; SDAG-NEXT: v_mov_b32_e32 v29, s3
-; SDAG-NEXT: v_mov_b32_e32 v30, s16
-; SDAG-NEXT: v_mov_b32_e32 v31, s17
-; SDAG-NEXT: v_mov_b32_e32 v32, s18
-; SDAG-NEXT: v_mov_b32_e32 v33, s19
-; SDAG-NEXT: v_mov_b32_e32 v16, s28
-; SDAG-NEXT: v_mov_b32_e32 v17, s29
-; SDAG-NEXT: v_mov_b32_e32 v18, s20
-; SDAG-NEXT: v_mov_b32_e32 v19, s21
-; SDAG-NEXT: v_mov_b32_e32 v20, s22
-; SDAG-NEXT: v_mov_b32_e32 v21, s23
-; SDAG-NEXT: v_mov_b32_e32 v22, s24
-; SDAG-NEXT: v_mov_b32_e32 v23, s25
-; SDAG-NEXT: v_mov_b32_e32 v24, s26
-; SDAG-NEXT: v_mov_b32_e32 v25, s27
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v0
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v1
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v2
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v3
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v4
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v5
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v6
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v7
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v8
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v9
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v10
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v11
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v12
-; SDAG-NEXT: v_accvgpr_write_b32 a15, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v14
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[18:25], a[0:15], v14, v15 op_sel_hi:[0,0,0]
+; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:3 blgp:1
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
@@ -3503,41 +3553,27 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inr
; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs:
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: s_mov_b32 s12, s0
-; GISEL-NEXT: s_mov_b32 s13, s1
-; GISEL-NEXT: s_mov_b32 s14, s2
-; GISEL-NEXT: s_mov_b32 s15, s3
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[12:13]
-; GISEL-NEXT: v_mov_b32_e32 v32, s28
-; GISEL-NEXT: v_mov_b32_e32 v33, s29
-; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[26:27]
-; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[24:25]
-; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[22:23]
-; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[20:21]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v32
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v33
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v0
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v1
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v2
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v3
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v4
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v5
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v6
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v7
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v8
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v9
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v10
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v11
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v12
-; GISEL-NEXT: v_accvgpr_write_b32 a15, v13
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a15, v29
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[16:23], v[24:31], a[0:15], v14, v15 op_sel_hi:[0,0,0]
+; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:3 blgp:1
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
@@ -3557,43 +3593,37 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inr
; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 3, ; cbsz
+ i32 1, ; blgp
+ i32 0, i32 0, i32 0, i32 0)
ret <16 x float> %result
}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr(<8 x i32> inreg %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 inreg %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr:
+; bf6 x fp6
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v26, s0
-; SDAG-NEXT: v_mov_b32_e32 v27, s1
-; SDAG-NEXT: v_mov_b32_e32 v28, s2
-; SDAG-NEXT: v_mov_b32_e32 v29, s3
-; SDAG-NEXT: v_mov_b32_e32 v30, s16
-; SDAG-NEXT: v_mov_b32_e32 v31, s17
-; SDAG-NEXT: v_mov_b32_e32 v32, s18
-; SDAG-NEXT: v_mov_b32_e32 v33, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v12
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v13
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v14
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a15, v23
-; SDAG-NEXT: v_mov_b32_e32 v8, s20
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v8, v24 op_sel_hi:[0,0,0]
-; SDAG-NEXT: s_nop 15
-; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:3 blgp:2
+; SDAG-NEXT: s_nop 11
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
@@ -3612,38 +3642,28 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp
; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr:
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: s_mov_b32 s12, s0
-; GISEL-NEXT: s_mov_b32 s13, s1
-; GISEL-NEXT: s_mov_b32 s14, s2
-; GISEL-NEXT: s_mov_b32 s15, s3
-; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[12:13]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v8
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v9
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v11
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v12
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v13
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a15, v23
-; GISEL-NEXT: v_mov_b32_e32 v8, s20
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v12
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v13
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a15, v27
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v8, v24 op_sel_hi:[0,0,0]
-; GISEL-NEXT: s_nop 15
-; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:3 blgp:2
+; GISEL-NEXT: s_nop 11
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
@@ -3661,43 +3681,36 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp
; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
+ i32 3, ; cbsz
+ i32 2, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgpr_sgpr(<8 x i32> inreg %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 inreg %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgpr_sgpr:
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v26, s0
-; SDAG-NEXT: v_mov_b32_e32 v27, s1
-; SDAG-NEXT: v_mov_b32_e32 v28, s2
-; SDAG-NEXT: v_mov_b32_e32 v29, s3
-; SDAG-NEXT: v_mov_b32_e32 v30, s16
-; SDAG-NEXT: v_mov_b32_e32 v31, s17
-; SDAG-NEXT: v_mov_b32_e32 v32, s18
-; SDAG-NEXT: v_mov_b32_e32 v33, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v12
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v13
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v14
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a15, v23
-; SDAG-NEXT: v_mov_b32_e32 v8, s20
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v24, v8 op_sel_hi:[0,0,0]
-; SDAG-NEXT: s_nop 15
-; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:3 blgp:2
+; SDAG-NEXT: s_nop 11
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
@@ -3716,38 +3729,28 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp
; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgpr_sgpr:
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: s_mov_b32 s12, s0
-; GISEL-NEXT: s_mov_b32 s13, s1
-; GISEL-NEXT: s_mov_b32 s14, s2
-; GISEL-NEXT: s_mov_b32 s15, s3
-; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[12:13]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v8
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v9
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v11
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v12
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v13
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a15, v23
-; GISEL-NEXT: v_mov_b32_e32 v8, s20
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v12
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v13
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a15, v27
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v24, v8 op_sel_hi:[0,0,0]
-; GISEL-NEXT: s_nop 15
-; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:3 blgp:2
+; GISEL-NEXT: s_nop 11
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
@@ -3765,43 +3768,37 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp
; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
+ i32 3, ; cbsz
+ i32 2, ; blgp
+ i32 0, i32 0, i32 0, i32 0)
ret <16 x float> %result
}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgpr_sgpr(<8 x i32> %arg0, <8 x i32> inreg %arg1, <16 x float> %arg2, i32 %scale0, i32 inreg %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgpr_sgpr:
+; bf6 x fp4
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v26, s0
-; SDAG-NEXT: v_mov_b32_e32 v27, s1
-; SDAG-NEXT: v_mov_b32_e32 v28, s2
-; SDAG-NEXT: v_mov_b32_e32 v29, s3
-; SDAG-NEXT: v_mov_b32_e32 v30, s16
-; SDAG-NEXT: v_mov_b32_e32 v31, s17
-; SDAG-NEXT: v_mov_b32_e32 v32, s18
-; SDAG-NEXT: v_mov_b32_e32 v33, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v12
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v13
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v14
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a15, v23
-; SDAG-NEXT: v_mov_b32_e32 v8, s20
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v11
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v10
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[26:33], a[0:15], v24, v8 op_sel_hi:[0,0,0]
-; SDAG-NEXT: s_nop 15
-; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:3 blgp:4
+; SDAG-NEXT: s_nop 11
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
@@ -3820,38 +3817,28 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp
; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgpr_sgpr:
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: s_mov_b32 s12, s0
-; GISEL-NEXT: s_mov_b32 s13, s1
-; GISEL-NEXT: s_mov_b32 s14, s2
-; GISEL-NEXT: s_mov_b32 s15, s3
-; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[12:13]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v8
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v9
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v11
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v12
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v13
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a15, v23
-; GISEL-NEXT: v_mov_b32_e32 v8, s20
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v10
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v11
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v12
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v13
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a15, v25
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[26:33], a[0:15], v24, v8 op_sel_hi:[0,0,0]
-; GISEL-NEXT: s_nop 15
-; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:3 blgp:4
+; GISEL-NEXT: s_nop 11
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
@@ -3869,98 +3856,36 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp
; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
- ret <16 x float> %result
-}
-
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_vgpr_sgpr__vgpr_sgpr(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> inreg %arg2, i32 %scale0, i32 inreg %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_vgpr_sgpr__vgpr_sgpr:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, s0
-; GCN-NEXT: v_accvgpr_write_b32 a1, s1
-; GCN-NEXT: v_accvgpr_write_b32 a2, s2
-; GCN-NEXT: v_accvgpr_write_b32 a3, s3
-; GCN-NEXT: v_accvgpr_write_b32 a4, s16
-; GCN-NEXT: v_accvgpr_write_b32 a5, s17
-; GCN-NEXT: v_accvgpr_write_b32 a6, s18
-; GCN-NEXT: v_accvgpr_write_b32 a7, s19
-; GCN-NEXT: v_accvgpr_write_b32 a8, s20
-; GCN-NEXT: v_accvgpr_write_b32 a9, s21
-; GCN-NEXT: v_accvgpr_write_b32 a10, s22
-; GCN-NEXT: v_accvgpr_write_b32 a11, s23
-; GCN-NEXT: v_accvgpr_write_b32 a12, s24
-; GCN-NEXT: v_accvgpr_write_b32 a13, s25
-; GCN-NEXT: v_accvgpr_write_b32 a14, s26
-; GCN-NEXT: v_accvgpr_write_b32 a15, s27
-; GCN-NEXT: v_mov_b32_e32 v17, s28
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0]
-; GCN-NEXT: s_nop 15
-; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
-; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
+ i32 3, ; cbsz
+ i32 4, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgpr_sgpr(<8 x i32> inreg %arg0, <8 x i32> %arg1, <16 x float> inreg %arg2, i32 %scale0, i32 inreg %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgpr_sgpr:
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v26, s0
-; SDAG-NEXT: v_mov_b32_e32 v27, s1
-; SDAG-NEXT: v_mov_b32_e32 v28, s2
-; SDAG-NEXT: v_mov_b32_e32 v29, s3
-; SDAG-NEXT: v_mov_b32_e32 v30, s16
-; SDAG-NEXT: v_mov_b32_e32 v31, s17
-; SDAG-NEXT: v_mov_b32_e32 v32, s18
-; SDAG-NEXT: v_mov_b32_e32 v33, s19
-; SDAG-NEXT: v_mov_b32_e32 v16, s20
-; SDAG-NEXT: v_mov_b32_e32 v17, s21
-; SDAG-NEXT: v_mov_b32_e32 v18, s22
-; SDAG-NEXT: v_mov_b32_e32 v19, s23
-; SDAG-NEXT: v_mov_b32_e32 v20, s24
-; SDAG-NEXT: v_mov_b32_e32 v21, s25
-; SDAG-NEXT: v_mov_b32_e32 v22, s26
-; SDAG-NEXT: v_mov_b32_e32 v23, s27
-; SDAG-NEXT: v_mov_b32_e32 v24, s28
-; SDAG-NEXT: v_mov_b32_e32 v25, s29
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v8
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v9
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v10
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v11
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v12
-; SDAG-NEXT: v_accvgpr_write_b32 a15, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v11
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v10
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0]
-; SDAG-NEXT: s_nop 15
-; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15] cbsz:3 blgp:4
+; SDAG-NEXT: s_nop 11
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
@@ -3979,47 +3904,28 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp
; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgpr_sgpr:
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: s_mov_b32 s12, s0
-; GISEL-NEXT: s_mov_b32 s13, s1
-; GISEL-NEXT: s_mov_b32 s14, s2
-; GISEL-NEXT: s_mov_b32 s15, s3
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[12:13]
-; GISEL-NEXT: v_mov_b32_e32 v24, s20
-; GISEL-NEXT: v_mov_b32_e32 v25, s21
-; GISEL-NEXT: v_mov_b32_e32 v26, s22
-; GISEL-NEXT: v_mov_b32_e32 v27, s23
-; GISEL-NEXT: v_mov_b32_e32 v28, s24
-; GISEL-NEXT: v_mov_b32_e32 v29, s25
-; GISEL-NEXT: v_mov_b32_e32 v30, s26
-; GISEL-NEXT: v_mov_b32_e32 v31, s27
-; GISEL-NEXT: v_mov_b32_e32 v32, s28
-; GISEL-NEXT: v_mov_b32_e32 v33, s29
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v29
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v30
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v31
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v32
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v33
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v8
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v9
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v10
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v11
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v12
-; GISEL-NEXT: v_accvgpr_write_b32 a15, v13
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v10
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v11
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v12
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v13
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a15, v25
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[16:23], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0]
-; GISEL-NEXT: s_nop 15
-; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15] cbsz:3 blgp:4
+; GISEL-NEXT: s_nop 11
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
@@ -4037,37 +3943,37 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp
; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
+ i32 3, ; cbsz
+ i32 4, ; blgp
+ i32 0, i32 0, i32 0, i32 0)
ret <16 x float> %result
}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm:
+; bf6 x bf6
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: v_mov_b32_e32 v31, -2
-; SDAG-NEXT: v_mov_b32_e32 v32, 33
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0]
-; SDAG-NEXT: s_nop 15
-; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:3 blgp:3
+; SDAG-NEXT: s_nop 11
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
@@ -4086,32 +3992,28 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_inlineimm__
; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm:
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: v_mov_b32_e32 v31, 33
-; GISEL-NEXT: v_mov_b32_e32 v32, -2
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0]
-; GISEL-NEXT: s_nop 15
-; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v12
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v13
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a15, v27
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:3 blgp:3
+; GISEL-NEXT: s_nop 11
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
@@ -4129,37 +4031,36 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_inlineimm__
; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 33, i32 2, i32 -2)
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
+ i32 3, ; cbsz
+ i32 3, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_inlineimm:
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: v_mov_b32_e32 v31, -2
-; SDAG-NEXT: v_mov_b32_e32 v32, 0x41
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0]
-; SDAG-NEXT: s_nop 15
-; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:3 blgp:3
+; SDAG-NEXT: s_nop 11
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
@@ -4178,32 +4079,28 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale
; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_inlineimm:
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: v_mov_b32_e32 v31, 0x41
-; GISEL-NEXT: v_mov_b32_e32 v32, -2
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0]
-; GISEL-NEXT: s_nop 15
-; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v12
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v13
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a15, v27
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:3 blgp:3
+; GISEL-NEXT: s_nop 11
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
@@ -4221,37 +4118,37 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale
; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, i32 -2)
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
+ i32 3, ; cbsz
+ i32 3, ; blgp
+ i32 0, i32 0, i32 0, i32 0)
ret <16 x float> %result
}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_FP_literal(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_FP_literal:
+; fp6 x fp4
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: v_mov_b32_e32 v31, 1.0
-; SDAG-NEXT: v_mov_b32_e32 v32, 0x41
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0]
-; SDAG-NEXT: s_nop 15
-; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v11
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v10
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:2 blgp:4
+; SDAG-NEXT: s_nop 11
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
@@ -4270,32 +4167,28 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale
; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_FP_literal:
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: v_mov_b32_e32 v31, 0x41
-; GISEL-NEXT: v_mov_b32_e32 v32, 1.0
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0]
-; GISEL-NEXT: s_nop 15
-; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v10
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v11
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v12
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v13
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a15, v25
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:2 blgp:4
+; GISEL-NEXT: s_nop 11
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
@@ -4313,37 +4206,36 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale
; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, i32 1065353216)
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
+ i32 2, ; cbsz
+ i32 4, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_inlineimm:
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: v_mov_b32_e32 v31, -2
-; SDAG-NEXT: v_mov_b32_e32 v32, 1.0
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0]
-; SDAG-NEXT: s_nop 15
-; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v11
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v10
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15] cbsz:2 blgp:4
+; SDAG-NEXT: s_nop 11
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
@@ -4362,32 +4254,28 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal_
; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_inlineimm:
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: v_mov_b32_e32 v31, 1.0
-; GISEL-NEXT: v_mov_b32_e32 v32, -2
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0]
-; GISEL-NEXT: s_nop 15
-; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v10
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v11
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v12
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v13
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a15, v25
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15] cbsz:2 blgp:4
+; GISEL-NEXT: s_nop 11
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
@@ -4405,35 +4293,36 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal_
; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 1065353216, i32 2, i32 -2)
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
+ i32 2, ; cbsz
+ i32 4, ; blgp
+ i32 0, i32 0, i32 0, i32 0)
ret <16 x float> %result
}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_FP_literal(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_FP_literal:
+; fp4 x fp8
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: v_mov_b32_e32 v31, 1.0
-; SDAG-NEXT: v_mov_b32_e32 v32, 0.15915494
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0]
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:4
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
@@ -4454,30 +4343,27 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal_
; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_FP_literal:
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: v_mov_b32_e32 v31, 0.15915494
-; GISEL-NEXT: v_mov_b32_e32 v32, 1.0
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0]
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v12
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v13
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a15, v27
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:4
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
@@ -4497,35 +4383,35 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal_
; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 1042479491, i32 2, i32 1065353216)
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 4, ; cbsz
+ i32 0, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_kimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_kimm:
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: v_mov_b32_e32 v31, 0x4d
-; SDAG-NEXT: v_mov_b32_e32 v32, 0x41
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0]
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15] cbsz:4
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
@@ -4546,30 +4432,27 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale
; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_kimm:
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: v_mov_b32_e32 v31, 0x41
-; GISEL-NEXT: v_mov_b32_e32 v32, 0x4d
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0]
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v12
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v13
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a15, v27
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15] cbsz:4
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
@@ -4589,224 +4472,2337 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale
; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, i32 77)
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 4, ; cbsz
+ i32 0, ; blgp
+ i32 0, i32 0, i32 0, i32 0)
ret <16 x float> %result
}
-define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1, ptr addrspace(1) %ptr) #0 {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd:
+; fp4 x bf8
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1:
; SDAG: ; %bb.0:
-; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
-; SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40
-; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x80
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
-; SDAG-NEXT: v_mov_b32_e32 v16, s8
-; SDAG-NEXT: v_mov_b32_e32 v17, s9
-; SDAG-NEXT: v_mov_b32_e32 v18, s10
-; SDAG-NEXT: v_mov_b32_e32 v19, s11
-; SDAG-NEXT: v_mov_b32_e32 v20, s12
-; SDAG-NEXT: v_mov_b32_e32 v21, s13
-; SDAG-NEXT: v_mov_b32_e32 v22, s14
-; SDAG-NEXT: v_mov_b32_e32 v23, s15
-; SDAG-NEXT: v_mov_b32_e32 v24, s16
-; SDAG-NEXT: v_mov_b32_e32 v25, s17
-; SDAG-NEXT: v_mov_b32_e32 v26, s18
-; SDAG-NEXT: v_mov_b32_e32 v27, s19
-; SDAG-NEXT: v_mov_b32_e32 v28, s20
-; SDAG-NEXT: v_mov_b32_e32 v29, s21
-; SDAG-NEXT: v_mov_b32_e32 v30, s22
-; SDAG-NEXT: v_mov_b32_e32 v31, s23
-; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
-; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
-; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
-; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
-; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
-; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
-; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
-; SDAG-NEXT: v_mov_b32_e32 v32, s0
-; SDAG-NEXT: v_mov_b32_e32 v33, s1
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, v33 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:4 blgp:1
; SDAG-NEXT: s_nop 15
-; SDAG-NEXT: s_nop 2
-; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:48
-; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:32
-; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:16
-; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3]
-; SDAG-NEXT: s_endpgm
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd:
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1:
; GISEL: ; %bb.0:
-; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
-; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40
-; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x80
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
-; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
-; GISEL-NEXT: v_mov_b32_e32 v32, s0
-; GISEL-NEXT: v_mov_b32_e32 v33, s1
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v12
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v13
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a15, v27
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, v33 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
-; GISEL-NEXT: v_mov_b32_e32 v16, 0
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:4 blgp:1
; GISEL-NEXT: s_nop 15
-; GISEL-NEXT: s_nop 2
-; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3]
-; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:16
-; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:32
-; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:48
-; GISEL-NEXT: s_endpgm
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 3, i32 %scale0, i32 1, i32 %scale1)
- store <16 x float> %result, ptr addrspace(1) %ptr, align 64
- ret void
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 4, ; cbsz
+ i32 1, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <16 x float> %result
}
-define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, ptr addrspace(1) %ptr) #0 {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm:
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0:
; SDAG: ; %bb.0:
-; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
-; SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40
-; SDAG-NEXT: v_mov_b32_e32 v32, -2
-; SDAG-NEXT: v_mov_b32_e32 v33, 0x41
-; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v16, s8
-; SDAG-NEXT: v_mov_b32_e32 v17, s9
-; SDAG-NEXT: v_mov_b32_e32 v18, s10
-; SDAG-NEXT: v_mov_b32_e32 v19, s11
-; SDAG-NEXT: v_mov_b32_e32 v20, s12
-; SDAG-NEXT: v_mov_b32_e32 v21, s13
-; SDAG-NEXT: v_mov_b32_e32 v22, s14
-; SDAG-NEXT: v_mov_b32_e32 v23, s15
-; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
-; SDAG-NEXT: v_mov_b32_e32 v24, s16
-; SDAG-NEXT: v_mov_b32_e32 v25, s17
-; SDAG-NEXT: v_mov_b32_e32 v26, s18
-; SDAG-NEXT: v_mov_b32_e32 v27, s19
-; SDAG-NEXT: v_mov_b32_e32 v28, s20
-; SDAG-NEXT: v_mov_b32_e32 v29, s21
-; SDAG-NEXT: v_mov_b32_e32 v30, s22
-; SDAG-NEXT: v_mov_b32_e32 v31, s23
-; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
-; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
-; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
-; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
-; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
-; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
-; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v33, v32 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
+; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15] cbsz:4 blgp:1
; SDAG-NEXT: s_nop 15
-; SDAG-NEXT: s_nop 2
-; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
-; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
-; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
-; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
-; SDAG-NEXT: s_endpgm
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm:
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0:
; GISEL: ; %bb.0:
-; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
-; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40
-; GISEL-NEXT: v_mov_b32_e32 v32, 0x41
-; GISEL-NEXT: v_mov_b32_e32 v33, -2
-; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
-; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v12
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v13
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a15, v27
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, v33 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
-; GISEL-NEXT: v_mov_b32_e32 v16, 0
+; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15] cbsz:4 blgp:1
; GISEL-NEXT: s_nop 15
-; GISEL-NEXT: s_nop 2
-; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
-; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
-; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
-; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
-; GISEL-NEXT: s_endpgm
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 3, i32 65, i32 1, i32 -2)
- store <16 x float> %result, ptr addrspace(1) %ptr, align 64
- ret void
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 4, ; cbsz
+ i32 1, ; blgp
+ i32 0, i32 0, i32 0, i32 0)
+ ret <16 x float> %result
}
-define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) #1 {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac:
+; fp4 x fp6
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2:
; SDAG: ; %bb.0:
-; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v2, s12
-; SDAG-NEXT: v_mov_b32_e32 v3, s13
-; SDAG-NEXT: v_mov_b32_e32 v4, s14
-; SDAG-NEXT: v_mov_b32_e32 v5, s15
-; SDAG-NEXT: v_mov_b32_e32 v6, s16
-; SDAG-NEXT: v_mov_b32_e32 v7, s17
-; SDAG-NEXT: v_mov_b32_e32 v8, s18
-; SDAG-NEXT: v_mov_b32_e32 v9, s19
-; SDAG-NEXT: v_mov_b32_e32 v10, s20
-; SDAG-NEXT: v_mov_b32_e32 v11, s21
-; SDAG-NEXT: v_mov_b32_e32 v12, s22
-; SDAG-NEXT: v_mov_b32_e32 v13, s23
-; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
-; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80
-; SDAG-NEXT: v_mov_b32_e32 v14, s24
-; SDAG-NEXT: v_mov_b32_e32 v15, s25
-; SDAG-NEXT: v_mov_b32_e32 v16, s26
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
-; SDAG-NEXT: v_mov_b32_e32 v17, s27
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
-; SDAG-NEXT: v_accvgpr_write_b32 a4, s12
-; SDAG-NEXT: v_accvgpr_write_b32 a5, s13
-; SDAG-NEXT: v_accvgpr_write_b32 a6, s14
-; SDAG-NEXT: v_accvgpr_write_b32 a7, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a8, s16
-; SDAG-NEXT: v_accvgpr_write_b32 a9, s17
-; SDAG-NEXT: v_accvgpr_write_b32 a10, s18
-; SDAG-NEXT: v_accvgpr_write_b32 a11, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a12, s20
-; SDAG-NEXT: v_accvgpr_write_b32 a13, s21
-; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
-; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
-; SDAG-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-NEXT: v_mov_b32_e32 v1, s1
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v11
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v10
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[2:9], v[10:17], a[0:15], v0, v1 op_sel_hi:[0,0,0]
-; SDAG-NEXT: v_mov_b32_e32 v2, s20
-; SDAG-NEXT: v_mov_b32_e32 v3, s21
-; SDAG-NEXT: v_mov_b32_e32 v4, s22
-; SDAG-NEXT: v_mov_b32_e32 v5, s23
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:4 blgp:2
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v10
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v11
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v12
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v13
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a15, v25
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:4 blgp:2
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
+ i32 4, ; cbsz
+ i32 2, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <16 x float> %result
+}
+
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v11
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v10
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15] cbsz:4 blgp:2
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v10
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v11
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v12
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v13
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a15, v25
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15] cbsz:4 blgp:2
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
+ i32 4, ; cbsz
+ i32 2, ; blgp
+ i32 0, i32 0, i32 0, i32 0)
+ ret <16 x float> %result
+}
+
+; fp4 x bf6
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v11
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v10
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:4 blgp:3
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v10
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v11
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v12
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v13
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a15, v25
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:4 blgp:3
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
+ i32 4, ; cbsz
+ i32 3, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <16 x float> %result
+}
+
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v11
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v10
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15] cbsz:4 blgp:3
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v10
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v11
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v12
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v13
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a15, v25
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15] cbsz:4 blgp:3
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
+ i32 4, ; cbsz
+ i32 3, ; blgp
+ i32 0, i32 0, i32 0, i32 0)
+ ret <16 x float> %result
+}
+
+; fp4 x fp4
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4(<4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:7], a[0:15], v24, v25 op_sel_hi:[0,0,0] cbsz:4 blgp:4
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v8
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v9
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v10
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v11
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v12
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v13
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a15, v23
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:7], a[0:15], v24, v25 op_sel_hi:[0,0,0] cbsz:4 blgp:4
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
+ i32 4, ; cbsz
+ i32 4, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <16 x float> %result
+}
+
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0(<4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:4 blgp:4
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v8
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v9
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v10
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v11
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v12
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v13
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a15, v23
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:4 blgp:4
+; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
+ i32 4, ; cbsz
+ i32 4, ; blgp
+ i32 0, i32 0, i32 0, i32 0)
+ ret <16 x float> %result
+}
+
+; --------------------------------------------------------------------
+; Different input parameter classes
+; --------------------------------------------------------------------
+
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__sgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 inreg %scale0, i32 inreg %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__sgpr_scaleB:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: scratch_load_dword a15, off, s32
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_mov_b32_e32 v16, s0
+; SDAG-NEXT: v_mov_b32_e32 v17, s1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0]
+; SDAG-NEXT: s_nop 15
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__sgpr_scaleB:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: v_mov_b32_e32 v16, s0
+; GISEL-NEXT: v_mov_b32_e32 v17, s1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0]
+; GISEL-NEXT: s_nop 15
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <16 x float> %result
+}
+
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__vgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 inreg %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__vgpr_scaleB:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: scratch_load_dword a15, off, s32
+; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:4
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_mov_b32_e32 v17, s0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[0,0,0]
+; SDAG-NEXT: s_nop 15
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__vgpr_scaleB:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: v_mov_b32_e32 v17, s0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[0,0,0]
+; GISEL-NEXT: s_nop 15
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <16 x float> %result
+}
+
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgpr_scaleA__sgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 inreg %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgpr_scaleA__sgpr_scaleB:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: scratch_load_dword a15, off, s32
+; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:4
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_mov_b32_e32 v17, s0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0]
+; SDAG-NEXT: s_nop 15
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgpr_scaleA__sgpr_scaleB:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: v_mov_b32_e32 v17, s0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0]
+; GISEL-NEXT: s_nop 15
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <16 x float> %result
+}
+
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x float> inreg %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_mov_b32_e32 v16, s0
+; SDAG-NEXT: v_mov_b32_e32 v17, s1
+; SDAG-NEXT: v_mov_b32_e32 v18, s2
+; SDAG-NEXT: v_mov_b32_e32 v19, s3
+; SDAG-NEXT: v_mov_b32_e32 v20, s16
+; SDAG-NEXT: v_mov_b32_e32 v21, s17
+; SDAG-NEXT: v_mov_b32_e32 v22, s18
+; SDAG-NEXT: v_mov_b32_e32 v23, s19
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v11
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v10
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v9
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v8
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v7
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v6
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v5
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v4
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v3
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v2
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v1
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v0
+; SDAG-NEXT: v_accvgpr_write_b32 a0, s28
+; SDAG-NEXT: v_accvgpr_write_b32 a1, s29
+; SDAG-NEXT: v_mov_b32_e32 v0, s20
+; SDAG-NEXT: v_mov_b32_e32 v1, s21
+; SDAG-NEXT: v_mov_b32_e32 v2, s22
+; SDAG-NEXT: v_mov_b32_e32 v3, s23
+; SDAG-NEXT: v_mov_b32_e32 v4, s24
+; SDAG-NEXT: v_mov_b32_e32 v5, s25
+; SDAG-NEXT: v_mov_b32_e32 v6, s26
+; SDAG-NEXT: v_mov_b32_e32 v7, s27
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[16:23], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0]
+; SDAG-NEXT: s_nop 15
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s12, s0
+; GISEL-NEXT: s_mov_b32 s13, s1
+; GISEL-NEXT: s_mov_b32 s14, s2
+; GISEL-NEXT: s_mov_b32 s15, s3
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v0
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v1
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v2
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v3
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v4
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v5
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v6
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v7
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v8
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v9
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v10
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v11
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v12
+; GISEL-NEXT: v_accvgpr_write_b32 a15, v13
+; GISEL-NEXT: v_accvgpr_write_b32 a0, s28
+; GISEL-NEXT: v_accvgpr_write_b32 a1, s29
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[26:27]
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[16:23], a[0:15], v14, v15 op_sel_hi:[0,0,0]
+; GISEL-NEXT: s_nop 15
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <16 x float> %result
+}
+
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr(<8 x i32> inreg %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 inreg %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
+; SDAG-NEXT: v_mov_b32_e32 v10, s0
+; SDAG-NEXT: v_mov_b32_e32 v11, s1
+; SDAG-NEXT: v_mov_b32_e32 v12, s2
+; SDAG-NEXT: v_mov_b32_e32 v13, s3
+; SDAG-NEXT: v_mov_b32_e32 v14, s16
+; SDAG-NEXT: v_mov_b32_e32 v15, s17
+; SDAG-NEXT: v_mov_b32_e32 v16, s18
+; SDAG-NEXT: v_mov_b32_e32 v17, s19
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
+; SDAG-NEXT: v_mov_b32_e32 v8, s20
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[10:17], v[0:7], a[0:15], v8, v24 op_sel_hi:[0,0,0]
+; SDAG-NEXT: s_nop 15
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s12, s0
+; GISEL-NEXT: s_mov_b32 s13, s1
+; GISEL-NEXT: s_mov_b32 s14, s2
+; GISEL-NEXT: s_mov_b32 s15, s3
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v10
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v11
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v12
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v13
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v17
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[18:19]
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v8
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v9
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a15, v23
+; GISEL-NEXT: v_mov_b32_e32 v8, s20
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[10:17], v[0:7], a[0:15], v8, v24 op_sel_hi:[0,0,0]
+; GISEL-NEXT: s_nop 15
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <16 x float> %result
+}
+
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgpr_sgpr(<8 x i32> inreg %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 inreg %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgpr_sgpr:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
+; SDAG-NEXT: v_mov_b32_e32 v10, s0
+; SDAG-NEXT: v_mov_b32_e32 v11, s1
+; SDAG-NEXT: v_mov_b32_e32 v12, s2
+; SDAG-NEXT: v_mov_b32_e32 v13, s3
+; SDAG-NEXT: v_mov_b32_e32 v14, s16
+; SDAG-NEXT: v_mov_b32_e32 v15, s17
+; SDAG-NEXT: v_mov_b32_e32 v16, s18
+; SDAG-NEXT: v_mov_b32_e32 v17, s19
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
+; SDAG-NEXT: v_mov_b32_e32 v8, s20
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[10:17], v[0:7], a[0:15], v24, v8 op_sel_hi:[0,0,0]
+; SDAG-NEXT: s_nop 15
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgpr_sgpr:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s12, s0
+; GISEL-NEXT: s_mov_b32 s13, s1
+; GISEL-NEXT: s_mov_b32 s14, s2
+; GISEL-NEXT: s_mov_b32 s15, s3
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v10
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v11
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v12
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v13
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v17
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[18:19]
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v8
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v9
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a15, v23
+; GISEL-NEXT: v_mov_b32_e32 v8, s20
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[10:17], v[0:7], a[0:15], v24, v8 op_sel_hi:[0,0,0]
+; GISEL-NEXT: s_nop 15
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <16 x float> %result
+}
+
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgpr_sgpr(<8 x i32> %arg0, <8 x i32> inreg %arg1, <16 x float> %arg2, i32 %scale0, i32 inreg %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgpr_sgpr:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
+; SDAG-NEXT: v_mov_b32_e32 v10, s0
+; SDAG-NEXT: v_mov_b32_e32 v11, s1
+; SDAG-NEXT: v_mov_b32_e32 v12, s2
+; SDAG-NEXT: v_mov_b32_e32 v13, s3
+; SDAG-NEXT: v_mov_b32_e32 v14, s16
+; SDAG-NEXT: v_mov_b32_e32 v15, s17
+; SDAG-NEXT: v_mov_b32_e32 v16, s18
+; SDAG-NEXT: v_mov_b32_e32 v17, s19
+; SDAG-NEXT: v_mov_b32_e32 v8, s20
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[10:17], a[0:15], v24, v8 op_sel_hi:[0,0,0]
+; SDAG-NEXT: s_nop 15
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgpr_sgpr:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s12, s0
+; GISEL-NEXT: s_mov_b32 s13, s1
+; GISEL-NEXT: s_mov_b32 s14, s2
+; GISEL-NEXT: s_mov_b32 s15, s3
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v10
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v11
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v12
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v13
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v17
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[12:13]
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v8
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v9
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a15, v23
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[18:19]
+; GISEL-NEXT: v_mov_b32_e32 v8, s20
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[10:17], a[0:15], v24, v8 op_sel_hi:[0,0,0]
+; GISEL-NEXT: s_nop 15
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <16 x float> %result
+}
+
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_vgpr_sgpr__vgpr_sgpr(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> inreg %arg2, i32 %scale0, i32 inreg %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_vgpr_sgpr__vgpr_sgpr:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, s0
+; GCN-NEXT: v_accvgpr_write_b32 a1, s1
+; GCN-NEXT: v_accvgpr_write_b32 a2, s2
+; GCN-NEXT: v_accvgpr_write_b32 a3, s3
+; GCN-NEXT: v_accvgpr_write_b32 a4, s16
+; GCN-NEXT: v_accvgpr_write_b32 a5, s17
+; GCN-NEXT: v_accvgpr_write_b32 a6, s18
+; GCN-NEXT: v_accvgpr_write_b32 a7, s19
+; GCN-NEXT: v_accvgpr_write_b32 a8, s20
+; GCN-NEXT: v_accvgpr_write_b32 a9, s21
+; GCN-NEXT: v_accvgpr_write_b32 a10, s22
+; GCN-NEXT: v_accvgpr_write_b32 a11, s23
+; GCN-NEXT: v_accvgpr_write_b32 a12, s24
+; GCN-NEXT: v_accvgpr_write_b32 a13, s25
+; GCN-NEXT: v_accvgpr_write_b32 a14, s26
+; GCN-NEXT: v_accvgpr_write_b32 a15, s27
+; GCN-NEXT: v_mov_b32_e32 v17, s28
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0]
+; GCN-NEXT: s_nop 15
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <16 x float> %result
+}
+
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgpr_sgpr(<8 x i32> inreg %arg0, <8 x i32> %arg1, <16 x float> inreg %arg2, i32 %scale0, i32 inreg %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgpr_sgpr:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_mov_b32_e32 v16, s0
+; SDAG-NEXT: v_mov_b32_e32 v17, s1
+; SDAG-NEXT: v_mov_b32_e32 v18, s2
+; SDAG-NEXT: v_mov_b32_e32 v19, s3
+; SDAG-NEXT: v_mov_b32_e32 v20, s16
+; SDAG-NEXT: v_mov_b32_e32 v21, s17
+; SDAG-NEXT: v_mov_b32_e32 v22, s18
+; SDAG-NEXT: v_mov_b32_e32 v23, s19
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v11
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v10
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v9
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v8
+; SDAG-NEXT: v_accvgpr_write_b32 a0, s20
+; SDAG-NEXT: v_accvgpr_write_b32 a1, s21
+; SDAG-NEXT: v_accvgpr_write_b32 a2, s22
+; SDAG-NEXT: v_accvgpr_write_b32 a3, s23
+; SDAG-NEXT: v_accvgpr_write_b32 a4, s24
+; SDAG-NEXT: v_accvgpr_write_b32 a5, s25
+; SDAG-NEXT: v_accvgpr_write_b32 a6, s26
+; SDAG-NEXT: v_accvgpr_write_b32 a7, s27
+; SDAG-NEXT: v_accvgpr_write_b32 a8, s28
+; SDAG-NEXT: v_accvgpr_write_b32 a9, s29
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[16:23], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0]
+; SDAG-NEXT: s_nop 15
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgpr_sgpr:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s12, s0
+; GISEL-NEXT: s_mov_b32 s13, s1
+; GISEL-NEXT: s_mov_b32 s14, s2
+; GISEL-NEXT: s_mov_b32 s15, s3
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[12:13]
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v8
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v9
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v10
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v11
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v12
+; GISEL-NEXT: v_accvgpr_write_b32 a15, v13
+; GISEL-NEXT: v_accvgpr_write_b32 a0, s20
+; GISEL-NEXT: v_accvgpr_write_b32 a1, s21
+; GISEL-NEXT: v_accvgpr_write_b32 a2, s22
+; GISEL-NEXT: v_accvgpr_write_b32 a3, s23
+; GISEL-NEXT: v_accvgpr_write_b32 a4, s24
+; GISEL-NEXT: v_accvgpr_write_b32 a5, s25
+; GISEL-NEXT: v_accvgpr_write_b32 a6, s26
+; GISEL-NEXT: v_accvgpr_write_b32 a7, s27
+; GISEL-NEXT: v_accvgpr_write_b32 a8, s28
+; GISEL-NEXT: v_accvgpr_write_b32 a9, s29
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[16:23], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0]
+; GISEL-NEXT: s_nop 15
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <16 x float> %result
+}
+
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: scratch_load_dword a15, off, s32
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_mov_b32_e32 v16, -2
+; SDAG-NEXT: v_mov_b32_e32 v17, 33
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[1,1,0]
+; SDAG-NEXT: s_nop 15
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: v_mov_b32_e32 v16, 33
+; GISEL-NEXT: v_mov_b32_e32 v17, -2
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[1,1,0]
+; GISEL-NEXT: s_nop 15
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 33, i32 2, i32 -2)
+ ret <16 x float> %result
+}
+
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_inlineimm:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: scratch_load_dword a15, off, s32
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_mov_b32_e32 v16, -2
+; SDAG-NEXT: v_mov_b32_e32 v17, 0x41
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[1,1,0]
+; SDAG-NEXT: s_nop 15
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_inlineimm:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: v_mov_b32_e32 v16, 0x41
+; GISEL-NEXT: v_mov_b32_e32 v17, -2
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[1,1,0]
+; GISEL-NEXT: s_nop 15
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, i32 -2)
+ ret <16 x float> %result
+}
+
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_FP_literal(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_FP_literal:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: scratch_load_dword a15, off, s32
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_mov_b32_e32 v16, 1.0
+; SDAG-NEXT: v_mov_b32_e32 v17, 0x41
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[1,1,0]
+; SDAG-NEXT: s_nop 15
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_FP_literal:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: v_mov_b32_e32 v16, 0x41
+; GISEL-NEXT: v_mov_b32_e32 v17, 1.0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[1,1,0]
+; GISEL-NEXT: s_nop 15
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, i32 1065353216)
+ ret <16 x float> %result
+}
+
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_inlineimm:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: scratch_load_dword a15, off, s32
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_mov_b32_e32 v16, -2
+; SDAG-NEXT: v_mov_b32_e32 v17, 1.0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[1,1,0]
+; SDAG-NEXT: s_nop 15
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_inlineimm:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: v_mov_b32_e32 v16, 1.0
+; GISEL-NEXT: v_mov_b32_e32 v17, -2
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[1,1,0]
+; GISEL-NEXT: s_nop 15
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 1065353216, i32 2, i32 -2)
+ ret <16 x float> %result
+}
+
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_FP_literal(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_FP_literal:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: scratch_load_dword a15, off, s32
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_mov_b32_e32 v16, 1.0
+; SDAG-NEXT: v_mov_b32_e32 v17, 0.15915494
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[1,1,0]
+; SDAG-NEXT: s_nop 15
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_FP_literal:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: v_mov_b32_e32 v16, 0.15915494
+; GISEL-NEXT: v_mov_b32_e32 v17, 1.0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[1,1,0]
+; GISEL-NEXT: s_nop 15
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 1042479491, i32 2, i32 1065353216)
+ ret <16 x float> %result
+}
+
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_kimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_kimm:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: scratch_load_dword a15, off, s32
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_mov_b32_e32 v16, 0x4d
+; SDAG-NEXT: v_mov_b32_e32 v17, 0x41
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[1,1,0]
+; SDAG-NEXT: s_nop 15
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_kimm:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: v_mov_b32_e32 v16, 0x41
+; GISEL-NEXT: v_mov_b32_e32 v17, 0x4d
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[1,1,0]
+; GISEL-NEXT: s_nop 15
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, i32 77)
+ ret <16 x float> %result
+}
+
+define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1, ptr addrspace(1) %ptr) #0 {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
+; SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40
+; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x80
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
+; SDAG-NEXT: v_mov_b32_e32 v16, s8
+; SDAG-NEXT: v_mov_b32_e32 v17, s9
+; SDAG-NEXT: v_mov_b32_e32 v18, s10
+; SDAG-NEXT: v_mov_b32_e32 v19, s11
+; SDAG-NEXT: v_mov_b32_e32 v20, s12
+; SDAG-NEXT: v_mov_b32_e32 v21, s13
+; SDAG-NEXT: v_mov_b32_e32 v22, s14
+; SDAG-NEXT: v_mov_b32_e32 v23, s15
+; SDAG-NEXT: v_mov_b32_e32 v24, s16
+; SDAG-NEXT: v_mov_b32_e32 v25, s17
+; SDAG-NEXT: v_mov_b32_e32 v26, s18
+; SDAG-NEXT: v_mov_b32_e32 v27, s19
+; SDAG-NEXT: v_mov_b32_e32 v28, s20
+; SDAG-NEXT: v_mov_b32_e32 v29, s21
+; SDAG-NEXT: v_mov_b32_e32 v30, s22
+; SDAG-NEXT: v_mov_b32_e32 v31, s23
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
+; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
+; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
+; SDAG-NEXT: v_mov_b32_e32 v32, s0
+; SDAG-NEXT: v_mov_b32_e32 v33, s1
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, v33 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
+; SDAG-NEXT: s_nop 15
+; SDAG-NEXT: s_nop 2
+; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:48
+; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:32
+; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:16
+; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3]
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
+; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40
+; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x80
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
+; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
+; GISEL-NEXT: v_mov_b32_e32 v32, s0
+; GISEL-NEXT: v_mov_b32_e32 v33, s1
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, v33 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
+; GISEL-NEXT: v_mov_b32_e32 v16, 0
+; GISEL-NEXT: s_nop 15
+; GISEL-NEXT: s_nop 2
+; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3]
+; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:16
+; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:32
+; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:48
+; GISEL-NEXT: s_endpgm
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 3, i32 %scale0, i32 1, i32 %scale1)
+ store <16 x float> %result, ptr addrspace(1) %ptr, align 64
+ ret void
+}
+
+define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, ptr addrspace(1) %ptr) #0 {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
+; SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40
+; SDAG-NEXT: v_mov_b32_e32 v32, -2
+; SDAG-NEXT: v_mov_b32_e32 v33, 0x41
+; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: v_mov_b32_e32 v16, s8
+; SDAG-NEXT: v_mov_b32_e32 v17, s9
+; SDAG-NEXT: v_mov_b32_e32 v18, s10
+; SDAG-NEXT: v_mov_b32_e32 v19, s11
+; SDAG-NEXT: v_mov_b32_e32 v20, s12
+; SDAG-NEXT: v_mov_b32_e32 v21, s13
+; SDAG-NEXT: v_mov_b32_e32 v22, s14
+; SDAG-NEXT: v_mov_b32_e32 v23, s15
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
+; SDAG-NEXT: v_mov_b32_e32 v24, s16
+; SDAG-NEXT: v_mov_b32_e32 v25, s17
+; SDAG-NEXT: v_mov_b32_e32 v26, s18
+; SDAG-NEXT: v_mov_b32_e32 v27, s19
+; SDAG-NEXT: v_mov_b32_e32 v28, s20
+; SDAG-NEXT: v_mov_b32_e32 v29, s21
+; SDAG-NEXT: v_mov_b32_e32 v30, s22
+; SDAG-NEXT: v_mov_b32_e32 v31, s23
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
+; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
+; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v33, v32 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
+; SDAG-NEXT: s_nop 15
+; SDAG-NEXT: s_nop 2
+; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
+; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40
+; GISEL-NEXT: v_mov_b32_e32 v32, 0x41
+; GISEL-NEXT: v_mov_b32_e32 v33, -2
+; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
+; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, v33 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
+; GISEL-NEXT: v_mov_b32_e32 v16, 0
+; GISEL-NEXT: s_nop 15
+; GISEL-NEXT: s_nop 2
+; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; GISEL-NEXT: s_endpgm
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 3, i32 65, i32 1, i32 -2)
+ store <16 x float> %result, ptr addrspace(1) %ptr, align 64
+ ret void
+}
+
+define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) #1 {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: v_mov_b32_e32 v2, s12
+; SDAG-NEXT: v_mov_b32_e32 v3, s13
+; SDAG-NEXT: v_mov_b32_e32 v4, s14
+; SDAG-NEXT: v_mov_b32_e32 v5, s15
+; SDAG-NEXT: v_mov_b32_e32 v6, s16
+; SDAG-NEXT: v_mov_b32_e32 v7, s17
+; SDAG-NEXT: v_mov_b32_e32 v8, s18
+; SDAG-NEXT: v_mov_b32_e32 v9, s19
+; SDAG-NEXT: v_mov_b32_e32 v10, s20
+; SDAG-NEXT: v_mov_b32_e32 v11, s21
+; SDAG-NEXT: v_mov_b32_e32 v12, s22
+; SDAG-NEXT: v_mov_b32_e32 v13, s23
+; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
+; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80
+; SDAG-NEXT: v_mov_b32_e32 v14, s24
+; SDAG-NEXT: v_mov_b32_e32 v15, s25
+; SDAG-NEXT: v_mov_b32_e32 v16, s26
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
+; SDAG-NEXT: v_mov_b32_e32 v17, s27
+; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
+; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
+; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
+; SDAG-NEXT: v_accvgpr_write_b32 a4, s12
+; SDAG-NEXT: v_accvgpr_write_b32 a5, s13
+; SDAG-NEXT: v_accvgpr_write_b32 a6, s14
+; SDAG-NEXT: v_accvgpr_write_b32 a7, s15
+; SDAG-NEXT: v_accvgpr_write_b32 a8, s16
+; SDAG-NEXT: v_accvgpr_write_b32 a9, s17
+; SDAG-NEXT: v_accvgpr_write_b32 a10, s18
+; SDAG-NEXT: v_accvgpr_write_b32 a11, s19
+; SDAG-NEXT: v_accvgpr_write_b32 a12, s20
+; SDAG-NEXT: v_accvgpr_write_b32 a13, s21
+; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
+; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
+; SDAG-NEXT: v_mov_b32_e32 v0, s0
+; SDAG-NEXT: v_mov_b32_e32 v1, s1
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[2:9], v[10:17], a[0:15], v0, v1 op_sel_hi:[0,0,0]
+; SDAG-NEXT: v_mov_b32_e32 v2, s20
+; SDAG-NEXT: v_mov_b32_e32 v3, s21
+; SDAG-NEXT: v_mov_b32_e32 v4, s22
+; SDAG-NEXT: v_mov_b32_e32 v5, s23
; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48
; SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -4846,18 +6842,18 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac(<8 x
; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], 0
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], 16
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], 0
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], 16
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[36:37]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[38:39]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[40:41]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[42:43]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[44:45]
; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[46:47]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[48:49]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[50:51]
; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
@@ -4873,36 +6869,36 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac(<8 x
; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
-; GISEL-NEXT: v_mov_b32_e32 v20, s0
-; GISEL-NEXT: v_mov_b32_e32 v21, s1
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], 48
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v20, v21 op_sel_hi:[0,0,0]
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[20:21], 32
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
+; GISEL-NEXT: v_mov_b32_e32 v4, s0
+; GISEL-NEXT: v_mov_b32_e32 v5, s1
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[6:13], v[14:21], a[0:15], v4, v5 op_sel_hi:[0,0,0]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], 32
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], 48
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[22:23]
+; GISEL-NEXT: global_store_dwordx4 v[0:1], v[8:11], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[18:19], v[4:7], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[2:3], v[12:15], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[20:21], v[8:11], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[4:5], v[16:19], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[22:23], v[12:15], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[6:7], v[20:23], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_nop 2
-; GISEL-NEXT: global_store_dwordx4 v[16:17], a[0:3], off sc0 sc1
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: global_store_dwordx4 v[0:1], a[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[18:19], a[4:7], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[2:3], a[4:7], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[20:21], a[8:11], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[4:5], a[8:11], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[22:23], a[12:15], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[6:7], a[12:15], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
@@ -4996,19 +6992,19 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac(<8
; GISEL: ; %bb.0:
; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
-; GISEL-NEXT: v_mov_b32_e32 v20, 25
-; GISEL-NEXT: v_mov_b32_e32 v21, 42
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], 0
+; GISEL-NEXT: v_mov_b32_e32 v4, 25
+; GISEL-NEXT: v_mov_b32_e32 v5, 42
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], 0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[36:37]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[38:39]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[40:41]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[42:43]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[44:45]
; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[46:47]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[48:49]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[50:51]
; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
@@ -5024,34 +7020,293 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac(<8
; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], 16
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], 48
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v20, v21 op_sel_hi:[0,0,0] blgp:2
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], 16
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[6:13], v[14:21], a[0:15], v4, v5 op_sel_hi:[0,0,0] blgp:2
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], 32
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], 48
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[22:23]
+; GISEL-NEXT: global_store_dwordx4 v[0:1], v[8:11], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_store_dwordx4 v[2:3], v[12:15], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_store_dwordx4 v[4:5], v[16:19], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_store_dwordx4 v[6:7], v[20:23], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: global_store_dwordx4 v[0:1], a[0:3], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_store_dwordx4 v[2:3], a[4:7], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_store_dwordx4 v[4:5], a[8:11], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_store_dwordx4 v[6:7], a[12:15], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_endpgm
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 25, i32 0, i32 42)
+ store volatile <16 x float> %arg2, ptr addrspace(1) null, align 64
+ store volatile <16 x float> %result, ptr addrspace(1) null, align 64
+ ret void
+}
+
+define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonmac(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) #0 {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonmac:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: v_mov_b32_e32 v32, s12
+; SDAG-NEXT: v_mov_b32_e32 v33, s13
+; SDAG-NEXT: v_mov_b32_e32 v34, s14
+; SDAG-NEXT: v_mov_b32_e32 v35, s15
+; SDAG-NEXT: v_mov_b32_e32 v36, s16
+; SDAG-NEXT: v_mov_b32_e32 v37, s17
+; SDAG-NEXT: v_mov_b32_e32 v38, s18
+; SDAG-NEXT: v_mov_b32_e32 v39, s19
+; SDAG-NEXT: v_mov_b32_e32 v40, s20
+; SDAG-NEXT: v_mov_b32_e32 v41, s21
+; SDAG-NEXT: v_mov_b32_e32 v42, s22
+; SDAG-NEXT: v_mov_b32_e32 v43, s23
+; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
+; SDAG-NEXT: v_mov_b32_e32 v44, s24
+; SDAG-NEXT: v_mov_b32_e32 v45, s25
+; SDAG-NEXT: v_mov_b32_e32 v46, s26
+; SDAG-NEXT: v_mov_b32_e32 v47, s27
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[32:39], v[40:47], v[16:31] blgp:2
+; SDAG-NEXT: s_nop 14
+; SDAG-NEXT: v_mov_b32_e32 v16, s20
+; SDAG-NEXT: v_mov_b32_e32 v17, s21
+; SDAG-NEXT: v_mov_b32_e32 v18, s22
+; SDAG-NEXT: v_mov_b32_e32 v19, s23
+; SDAG-NEXT: v_mov_b64_e32 v[20:21], 48
+; SDAG-NEXT: global_store_dwordx4 v[20:21], v[16:19], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: v_mov_b64_e32 v[22:23], 32
+; SDAG-NEXT: v_mov_b64_e32 v[24:25], 16
+; SDAG-NEXT: v_mov_b32_e32 v16, s16
+; SDAG-NEXT: v_mov_b32_e32 v17, s17
+; SDAG-NEXT: v_mov_b32_e32 v18, s18
+; SDAG-NEXT: v_mov_b32_e32 v19, s19
+; SDAG-NEXT: global_store_dwordx4 v[22:23], v[16:19], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: v_mov_b64_e32 v[26:27], 0
+; SDAG-NEXT: v_mov_b32_e32 v16, s12
+; SDAG-NEXT: v_mov_b32_e32 v17, s13
+; SDAG-NEXT: v_mov_b32_e32 v18, s14
+; SDAG-NEXT: v_mov_b32_e32 v19, s15
+; SDAG-NEXT: global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mov_b32_e32 v16, s8
+; SDAG-NEXT: v_mov_b32_e32 v17, s9
+; SDAG-NEXT: v_mov_b32_e32 v18, s10
+; SDAG-NEXT: v_mov_b32_e32 v19, s11
+; SDAG-NEXT: global_store_dwordx4 v[26:27], v[16:19], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: global_store_dwordx4 v[22:23], v[8:11], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: global_store_dwordx4 v[20:21], v[12:15], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: global_store_dwordx4 v[24:25], v[4:7], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonmac:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0
+; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[36:37]
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39]
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[40:41]
+; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[42:43]
+; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[44:45]
+; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[46:47]
+; GISEL-NEXT: v_mov_b64_e32 v[44:45], s[48:49]
+; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[50:51]
+; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[32:39], v[40:47], v[16:31] blgp:2
+; GISEL-NEXT: v_mov_b64_e32 v[32:33], 0
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], 16
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], 32
+; GISEL-NEXT: v_mov_b64_e32 v[38:39], 48
+; GISEL-NEXT: global_store_dwordx4 v[32:33], v[16:19], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_store_dwordx4 v[34:35], v[20:23], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_store_dwordx4 v[36:37], v[24:27], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_store_dwordx4 v[36:37], v[8:11], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_store_dwordx4 v[38:39], v[12:15], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_endpgm
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 0, i32 0, i32 0)
+ store volatile <16 x float> %arg2, ptr addrspace(1) null, align 64
+ store volatile <16 x float> %result, ptr addrspace(1) null, align 64
+ ret void
+}
+
+define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_nonmac(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) #0 {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_nonmac:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0
+; SDAG-NEXT: v_mov_b32_e32 v32, 42
+; SDAG-NEXT: v_mov_b32_e32 v33, 25
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: v_mov_b32_e32 v16, s12
+; SDAG-NEXT: v_mov_b32_e32 v17, s13
+; SDAG-NEXT: v_mov_b32_e32 v18, s14
+; SDAG-NEXT: v_mov_b32_e32 v19, s15
+; SDAG-NEXT: v_mov_b32_e32 v20, s16
+; SDAG-NEXT: v_mov_b32_e32 v21, s17
+; SDAG-NEXT: v_mov_b32_e32 v22, s18
+; SDAG-NEXT: v_mov_b32_e32 v23, s19
+; SDAG-NEXT: v_mov_b32_e32 v24, s20
+; SDAG-NEXT: v_mov_b32_e32 v25, s21
+; SDAG-NEXT: v_mov_b32_e32 v26, s22
+; SDAG-NEXT: v_mov_b32_e32 v27, s23
+; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
+; SDAG-NEXT: v_mov_b32_e32 v28, s24
+; SDAG-NEXT: v_mov_b32_e32 v29, s25
+; SDAG-NEXT: v_mov_b32_e32 v30, s26
+; SDAG-NEXT: v_mov_b32_e32 v31, s27
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v33, v32 op_sel_hi:[0,0,0] blgp:2
+; SDAG-NEXT: v_mov_b32_e32 v16, s20
+; SDAG-NEXT: v_mov_b32_e32 v17, s21
+; SDAG-NEXT: v_mov_b32_e32 v18, s22
+; SDAG-NEXT: v_mov_b32_e32 v19, s23
+; SDAG-NEXT: v_mov_b64_e32 v[20:21], 48
+; SDAG-NEXT: global_store_dwordx4 v[20:21], v[16:19], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: v_mov_b64_e32 v[22:23], 32
+; SDAG-NEXT: v_mov_b64_e32 v[24:25], 16
+; SDAG-NEXT: v_mov_b32_e32 v16, s16
+; SDAG-NEXT: v_mov_b32_e32 v17, s17
+; SDAG-NEXT: v_mov_b32_e32 v18, s18
+; SDAG-NEXT: v_mov_b32_e32 v19, s19
+; SDAG-NEXT: global_store_dwordx4 v[22:23], v[16:19], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: v_mov_b64_e32 v[26:27], 0
+; SDAG-NEXT: v_mov_b32_e32 v16, s12
+; SDAG-NEXT: v_mov_b32_e32 v17, s13
+; SDAG-NEXT: v_mov_b32_e32 v18, s14
+; SDAG-NEXT: v_mov_b32_e32 v19, s15
+; SDAG-NEXT: global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mov_b32_e32 v16, s8
+; SDAG-NEXT: v_mov_b32_e32 v17, s9
+; SDAG-NEXT: v_mov_b32_e32 v18, s10
+; SDAG-NEXT: v_mov_b32_e32 v19, s11
+; SDAG-NEXT: global_store_dwordx4 v[26:27], v[16:19], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: global_store_dwordx4 v[22:23], v[8:11], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: global_store_dwordx4 v[20:21], v[12:15], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: global_store_dwordx4 v[24:25], v[4:7], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_nonmac:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0
+; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
+; GISEL-NEXT: v_mov_b32_e32 v32, 25
+; GISEL-NEXT: v_mov_b32_e32 v33, 42
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], 16
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[36:37]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[38:39]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[40:41]
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[42:43]
+; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[44:45]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[46:47]
+; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[48:49]
+; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[50:51]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[20:21], 32
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], 32
+; GISEL-NEXT: v_mov_b64_e32 v[38:39], 48
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, v33 op_sel_hi:[0,0,0] blgp:2
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[32:33], 0
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; GISEL-NEXT: global_store_dwordx4 v[32:33], v[16:19], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[18:19], v[4:7], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[34:35], v[20:23], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[20:21], v[8:11], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[36:37], v[24:27], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[22:23], v[12:15], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 2
-; GISEL-NEXT: global_store_dwordx4 v[16:17], a[0:3], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[18:19], a[4:7], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[20:21], a[8:11], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[36:37], v[8:11], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[22:23], a[12:15], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[38:39], v[12:15], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 25, i32 0, i32 42)
@@ -5060,383 +7315,586 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac(<8
ret void
}
-define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonmac(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) #0 {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonmac:
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_a(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_a:
; SDAG: ; %bb.0:
-; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v32, s12
-; SDAG-NEXT: v_mov_b32_e32 v33, s13
-; SDAG-NEXT: v_mov_b32_e32 v34, s14
-; SDAG-NEXT: v_mov_b32_e32 v35, s15
-; SDAG-NEXT: v_mov_b32_e32 v36, s16
-; SDAG-NEXT: v_mov_b32_e32 v37, s17
-; SDAG-NEXT: v_mov_b32_e32 v38, s18
-; SDAG-NEXT: v_mov_b32_e32 v39, s19
-; SDAG-NEXT: v_mov_b32_e32 v40, s20
-; SDAG-NEXT: v_mov_b32_e32 v41, s21
-; SDAG-NEXT: v_mov_b32_e32 v42, s22
-; SDAG-NEXT: v_mov_b32_e32 v43, s23
-; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
-; SDAG-NEXT: v_mov_b32_e32 v44, s24
-; SDAG-NEXT: v_mov_b32_e32 v45, s25
-; SDAG-NEXT: v_mov_b32_e32 v46, s26
-; SDAG-NEXT: v_mov_b32_e32 v47, s27
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
-; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
-; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
-; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
-; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
-; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
-; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
-; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[32:39], v[40:47], v[16:31] blgp:2
-; SDAG-NEXT: s_nop 14
-; SDAG-NEXT: v_mov_b32_e32 v16, s20
-; SDAG-NEXT: v_mov_b32_e32 v17, s21
-; SDAG-NEXT: v_mov_b32_e32 v18, s22
-; SDAG-NEXT: v_mov_b32_e32 v19, s23
-; SDAG-NEXT: v_mov_b64_e32 v[20:21], 48
-; SDAG-NEXT: global_store_dwordx4 v[20:21], v[16:19], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: scratch_load_dword a15, off, s32
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[22:23], 32
-; SDAG-NEXT: v_mov_b64_e32 v[24:25], 16
-; SDAG-NEXT: v_mov_b32_e32 v16, s16
-; SDAG-NEXT: v_mov_b32_e32 v17, s17
-; SDAG-NEXT: v_mov_b32_e32 v18, s18
-; SDAG-NEXT: v_mov_b32_e32 v19, s19
-; SDAG-NEXT: global_store_dwordx4 v[22:23], v[16:19], off sc0 sc1
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15]
+; SDAG-NEXT: s_nop 15
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_a:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15]
+; GISEL-NEXT: s_nop 15
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ ret <16 x float> %result
+}
+
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_b(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_b:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: scratch_load_dword a15, off, s32
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[26:27], 0
-; SDAG-NEXT: v_mov_b32_e32 v16, s12
-; SDAG-NEXT: v_mov_b32_e32 v17, s13
-; SDAG-NEXT: v_mov_b32_e32 v18, s14
-; SDAG-NEXT: v_mov_b32_e32 v19, s15
-; SDAG-NEXT: global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15]
+; SDAG-NEXT: s_nop 15
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_b:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15]
+; GISEL-NEXT: s_nop 15
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 3, i32 0, i32 1, i32 0)
+ ret <16 x float> %result
+}
+
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_1(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_1:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: scratch_load_dword a15, off, s32
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_mov_b32_e32 v16, 1
+; SDAG-NEXT: v_mov_b32_e32 v17, 0
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v16, s8
-; SDAG-NEXT: v_mov_b32_e32 v17, s9
-; SDAG-NEXT: v_mov_b32_e32 v18, s10
-; SDAG-NEXT: v_mov_b32_e32 v19, s11
-; SDAG-NEXT: global_store_dwordx4 v[26:27], v[16:19], off sc0 sc1
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[22:23], v[8:11], off sc0 sc1
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[20:21], v[12:15], off sc0 sc1
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[24:25], v[4:7], off sc0 sc1
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_endpgm
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[0,0,0]
+; SDAG-NEXT: s_nop 15
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonmac:
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_1:
; GISEL: ; %bb.0:
-; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0
-; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[36:37]
-; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39]
-; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[40:41]
-; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[42:43]
-; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[44:45]
-; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
-; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[46:47]
-; GISEL-NEXT: v_mov_b64_e32 v[44:45], s[48:49]
-; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[50:51]
-; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[32:39], v[40:47], v[16:31] blgp:2
-; GISEL-NEXT: v_mov_b64_e32 v[32:33], 0
-; GISEL-NEXT: v_mov_b64_e32 v[34:35], 16
-; GISEL-NEXT: v_mov_b64_e32 v[36:37], 32
-; GISEL-NEXT: v_mov_b64_e32 v[38:39], 48
-; GISEL-NEXT: global_store_dwordx4 v[32:33], v[16:19], off sc0 sc1
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[34:35], v[20:23], off sc0 sc1
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[36:37], v[24:27], off sc0 sc1
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_nop 7
-; GISEL-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[36:37], v[8:11], off sc0 sc1
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[38:39], v[12:15], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: v_mov_b32_e32 v16, 0
+; GISEL-NEXT: v_mov_b32_e32 v17, 1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_endpgm
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 0, i32 0, i32 0)
- store volatile <16 x float> %arg2, ptr addrspace(1) null, align 64
- store volatile <16 x float> %result, ptr addrspace(1) null, align 64
- ret void
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0]
+; GISEL-NEXT: s_nop 15
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1)
+ ret <16 x float> %result
}
-define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_nonmac(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) #0 {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_nonmac:
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_1_0_a(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_1_0_a:
; SDAG: ; %bb.0:
-; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0
-; SDAG-NEXT: v_mov_b32_e32 v32, 42
-; SDAG-NEXT: v_mov_b32_e32 v33, 25
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v16, s12
-; SDAG-NEXT: v_mov_b32_e32 v17, s13
-; SDAG-NEXT: v_mov_b32_e32 v18, s14
-; SDAG-NEXT: v_mov_b32_e32 v19, s15
-; SDAG-NEXT: v_mov_b32_e32 v20, s16
-; SDAG-NEXT: v_mov_b32_e32 v21, s17
-; SDAG-NEXT: v_mov_b32_e32 v22, s18
-; SDAG-NEXT: v_mov_b32_e32 v23, s19
-; SDAG-NEXT: v_mov_b32_e32 v24, s20
-; SDAG-NEXT: v_mov_b32_e32 v25, s21
-; SDAG-NEXT: v_mov_b32_e32 v26, s22
-; SDAG-NEXT: v_mov_b32_e32 v27, s23
-; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
-; SDAG-NEXT: v_mov_b32_e32 v28, s24
-; SDAG-NEXT: v_mov_b32_e32 v29, s25
-; SDAG-NEXT: v_mov_b32_e32 v30, s26
-; SDAG-NEXT: v_mov_b32_e32 v31, s27
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
-; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
-; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
-; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
-; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
-; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v33, v32 op_sel_hi:[0,0,0] blgp:2
-; SDAG-NEXT: v_mov_b32_e32 v16, s20
-; SDAG-NEXT: v_mov_b32_e32 v17, s21
-; SDAG-NEXT: v_mov_b32_e32 v18, s22
-; SDAG-NEXT: v_mov_b32_e32 v19, s23
-; SDAG-NEXT: v_mov_b64_e32 v[20:21], 48
-; SDAG-NEXT: global_store_dwordx4 v[20:21], v[16:19], off sc0 sc1
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[22:23], 32
-; SDAG-NEXT: v_mov_b64_e32 v[24:25], 16
-; SDAG-NEXT: v_mov_b32_e32 v16, s16
-; SDAG-NEXT: v_mov_b32_e32 v17, s17
-; SDAG-NEXT: v_mov_b32_e32 v18, s18
-; SDAG-NEXT: v_mov_b32_e32 v19, s19
-; SDAG-NEXT: global_store_dwordx4 v[22:23], v[16:19], off sc0 sc1
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[26:27], 0
-; SDAG-NEXT: v_mov_b32_e32 v16, s12
-; SDAG-NEXT: v_mov_b32_e32 v17, s13
-; SDAG-NEXT: v_mov_b32_e32 v18, s14
-; SDAG-NEXT: v_mov_b32_e32 v19, s15
-; SDAG-NEXT: global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: scratch_load_dword a15, off, s32
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
+; SDAG-NEXT: v_mov_b32_e32 v17, 1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v16, s8
-; SDAG-NEXT: v_mov_b32_e32 v17, s9
-; SDAG-NEXT: v_mov_b32_e32 v18, s10
-; SDAG-NEXT: v_mov_b32_e32 v19, s11
-; SDAG-NEXT: global_store_dwordx4 v[26:27], v[16:19], off sc0 sc1
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[22:23], v[8:11], off sc0 sc1
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[20:21], v[12:15], off sc0 sc1
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[24:25], v[4:7], off sc0 sc1
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_endpgm
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[0,0,0]
+; SDAG-NEXT: s_nop 15
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_nonmac:
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_1_0_a:
; GISEL: ; %bb.0:
-; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0
-; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
-; GISEL-NEXT: v_mov_b32_e32 v32, 25
-; GISEL-NEXT: v_mov_b32_e32 v33, 42
-; GISEL-NEXT: v_mov_b64_e32 v[34:35], 16
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[36:37]
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[38:39]
-; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[40:41]
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[42:43]
-; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[44:45]
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[46:47]
-; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[48:49]
-; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[50:51]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; GISEL-NEXT: v_mov_b64_e32 v[36:37], 32
-; GISEL-NEXT: v_mov_b64_e32 v[38:39], 48
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, v33 op_sel_hi:[0,0,0] blgp:2
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
-; GISEL-NEXT: v_mov_b64_e32 v[32:33], 0
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
-; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
-; GISEL-NEXT: global_store_dwordx4 v[32:33], v[16:19], off sc0 sc1
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[34:35], v[20:23], off sc0 sc1
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[36:37], v[24:27], off sc0 sc1
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_nop 2
-; GISEL-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[36:37], v[8:11], off sc0 sc1
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[38:39], v[12:15], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: v_mov_b32_e32 v16, 1
+; GISEL-NEXT: v_mov_b32_e32 v17, 0
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_endpgm
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 25, i32 0, i32 42)
- store volatile <16 x float> %arg2, ptr addrspace(1) null, align 64
- store volatile <16 x float> %result, ptr addrspace(1) null, align 64
- ret void
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0]
+; GISEL-NEXT: s_nop 15
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0)
+ ret <16 x float> %result
}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_a(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_a:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: scratch_load_dword a15, off, s32
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: v_accvgpr_write_b32 a4, v20
-; GCN-NEXT: v_accvgpr_write_b32 a5, v21
-; GCN-NEXT: v_accvgpr_write_b32 a6, v22
-; GCN-NEXT: v_accvgpr_write_b32 a7, v23
-; GCN-NEXT: v_accvgpr_write_b32 a8, v24
-; GCN-NEXT: v_accvgpr_write_b32 a9, v25
-; GCN-NEXT: v_accvgpr_write_b32 a10, v26
-; GCN-NEXT: v_accvgpr_write_b32 a11, v27
-; GCN-NEXT: v_accvgpr_write_b32 a12, v28
-; GCN-NEXT: v_accvgpr_write_b32 a13, v29
-; GCN-NEXT: v_accvgpr_write_b32 a14, v30
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15]
-; GCN-NEXT: s_nop 15
-; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
-; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+; --------------------------------------------------------------------
+; Incorrect signature for format cases (IR vector too large)
+; --------------------------------------------------------------------
+
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp6(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp6:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: scratch_load_dword a15, off, s32
+; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword v17, off, s32 offset:4
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[0,0,0] blgp:2
+; SDAG-NEXT: s_nop 15
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp6:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4
+; GISEL-NEXT: scratch_load_dword v17, off, s32 offset:8
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] blgp:2
+; GISEL-NEXT: s_nop 15
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 0, ; cbsz
+ i32 2, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_b(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_b:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: scratch_load_dword a15, off, s32
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: v_accvgpr_write_b32 a4, v20
-; GCN-NEXT: v_accvgpr_write_b32 a5, v21
-; GCN-NEXT: v_accvgpr_write_b32 a6, v22
-; GCN-NEXT: v_accvgpr_write_b32 a7, v23
-; GCN-NEXT: v_accvgpr_write_b32 a8, v24
-; GCN-NEXT: v_accvgpr_write_b32 a9, v25
-; GCN-NEXT: v_accvgpr_write_b32 a10, v26
-; GCN-NEXT: v_accvgpr_write_b32 a11, v27
-; GCN-NEXT: v_accvgpr_write_b32 a12, v28
-; GCN-NEXT: v_accvgpr_write_b32 a13, v29
-; GCN-NEXT: v_accvgpr_write_b32 a14, v30
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15]
-; GCN-NEXT: s_nop 15
-; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
-; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 3, i32 0, i32 1, i32 0)
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp8(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: scratch_load_dword a15, off, s32
+; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword v17, off, s32 offset:4
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[0,0,0] cbsz:2
+; SDAG-NEXT: s_nop 15
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4
+; GISEL-NEXT: scratch_load_dword v17, off, s32 offset:8
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] cbsz:2
+; GISEL-NEXT: s_nop 15
+; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 2, ; cbsz
+ i32 0, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_1(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_1:
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: v_mov_b32_e32 v31, 1
-; SDAG-NEXT: v_mov_b32_e32 v32, 0
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: scratch_load_dword a15, off, s32
+; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword v17, off, s32 offset:4
; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0]
-; SDAG-NEXT: s_nop 15
-; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[0,0,0] cbsz:2 blgp:2
+; SDAG-NEXT: s_nop 11
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
@@ -5455,14 +7913,14 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_1(<8
; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_1:
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: v_mov_b32_e32 v31, 0
-; GISEL-NEXT: v_mov_b32_e32 v32, 1
; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4
+; GISEL-NEXT: scratch_load_dword v17, off, s32 offset:8
; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
@@ -5478,9 +7936,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_1(<8
; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0]
-; GISEL-NEXT: s_nop 15
-; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] cbsz:2 blgp:2
+; GISEL-NEXT: s_nop 11
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
@@ -5498,37 +7955,37 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_1(<8
; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1)
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 2, ; cbsz
+ i32 2, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_1_0_a(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_1_0_a:
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6__0_scale(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6__0_scale:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: v_mov_b32_e32 v31, 0
-; SDAG-NEXT: v_mov_b32_e32 v32, 1
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0]
-; SDAG-NEXT: s_nop 15
-; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:2 blgp:2
+; SDAG-NEXT: s_nop 11
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
@@ -5547,12 +8004,10 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_1_0_a(
; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_1_0_a:
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6__0_scale:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: v_mov_b32_e32 v31, 1
-; GISEL-NEXT: v_mov_b32_e32 v32, 0
; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
@@ -5570,9 +8025,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_1_0_a(
; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0]
-; GISEL-NEXT: s_nop 15
-; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:2 blgp:2
+; GISEL-NEXT: s_nop 11
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
@@ -5590,39 +8044,38 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_1_0_a(
; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0)
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 2, ; cbsz
+ i32 2, ; blgp
+ i32 0, i32 0, i32 0, i32 0)
ret <16 x float> %result
}
-; --------------------------------------------------------------------
-; Incorrect signature for format cases (IR vector too large)
-; --------------------------------------------------------------------
-
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp6(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp6:
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
-; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: scratch_load_dword a15, off, s32
+; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword v17, off, s32 offset:4
; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] blgp:2
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[0,0,0] blgp:4
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
@@ -5643,14 +8096,14 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp6(
; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp6:
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4
-; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8
; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4
+; GISEL-NEXT: scratch_load_dword v17, off, s32 offset:8
; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
@@ -5666,7 +8119,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp6(
; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] blgp:2
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] blgp:4
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
@@ -5688,36 +8141,36 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp6(
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 0, ; cbsz
- i32 2, ; blgp
+ i32 4, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp8(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp8:
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
-; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: scratch_load_dword a15, off, s32
+; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword v17, off, s32 offset:4
; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:2
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[0,0,0] cbsz:4
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
@@ -5738,14 +8191,14 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp8(
; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp8:
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4
-; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8
; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4
+; GISEL-NEXT: scratch_load_dword v17, off, s32 offset:8
; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
@@ -5761,7 +8214,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp8(
; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:2
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] cbsz:4
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
@@ -5782,38 +8235,38 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp8(
; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
- i32 2, ; cbsz
+ i32 4, ; cbsz
i32 0, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6:
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v6i32_fp4(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v6i32_fp4:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
-; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v14
+; SDAG-NEXT: scratch_load_dword v14, off, s32
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v15
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:2 blgp:2
-; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] blgp:4
+; SDAG-NEXT: s_nop 15
+; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
@@ -5832,31 +8285,31 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6(
; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6:
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v6i32_fp4:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4
-; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v14
+; GISEL-NEXT: scratch_load_dword v14, off, s32
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a15, v29
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:2 blgp:2
-; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] blgp:4
+; GISEL-NEXT: s_nop 15
+; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
@@ -5874,86 +8327,37 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6(
; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
- i32 2, ; cbsz
- i32 2, ; blgp
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
+ i32 0, ; cbsz
+ i32 4, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6__0_scale(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6__0_scale:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: scratch_load_dword a15, off, s32
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: v_accvgpr_write_b32 a4, v20
-; GCN-NEXT: v_accvgpr_write_b32 a5, v21
-; GCN-NEXT: v_accvgpr_write_b32 a6, v22
-; GCN-NEXT: v_accvgpr_write_b32 a7, v23
-; GCN-NEXT: v_accvgpr_write_b32 a8, v24
-; GCN-NEXT: v_accvgpr_write_b32 a9, v25
-; GCN-NEXT: v_accvgpr_write_b32 a10, v26
-; GCN-NEXT: v_accvgpr_write_b32 a11, v27
-; GCN-NEXT: v_accvgpr_write_b32 a12, v28
-; GCN-NEXT: v_accvgpr_write_b32 a13, v29
-; GCN-NEXT: v_accvgpr_write_b32 a14, v30
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:2 blgp:2
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
-; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
- i32 2, ; cbsz
- i32 2, ; blgp
- i32 0, i32 0, i32 0, i32 0)
- ret <16 x float> %result
-}
-
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4:
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v6i32_fp4__v8i32_fp8(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v6i32_fp4__v8i32_fp8:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
-; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v14
+; SDAG-NEXT: scratch_load_dword v14, off, s32
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v15
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] blgp:4
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:4
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
@@ -5974,30 +8378,29 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4(
; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4:
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v6i32_fp4__v8i32_fp8:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4
-; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v14
+; GISEL-NEXT: scratch_load_dword v14, off, s32
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a15, v29
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] blgp:4
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:4
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
@@ -6017,40 +8420,39 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4(
; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
- i32 0, ; cbsz
- i32 4, ; blgp
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 4, ; cbsz
+ i32 0, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8:
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
-; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: scratch_load_dword a15, off, s32
+; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword v17, off, s32 offset:4
; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:4
-; SDAG-NEXT: s_nop 15
-; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[0,0,0] cbsz:4 blgp:4
+; SDAG-NEXT: s_nop 11
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
@@ -6069,14 +8471,14 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8(
; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8:
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4
-; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8
; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4
+; GISEL-NEXT: scratch_load_dword v17, off, s32 offset:8
; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
@@ -6092,9 +8494,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8(
; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:4
-; GISEL-NEXT: s_nop 15
-; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] cbsz:4 blgp:4
+; GISEL-NEXT: s_nop 11
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
@@ -6114,136 +8515,34 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8(
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 4, ; cbsz
- i32 0, ; blgp
- i32 0, i32 %scale0, i32 0, i32 %scale1)
- ret <16 x float> %result
-}
-
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v6i32_fp4(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v6i32_fp4:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: scratch_load_dword v31, off, s32
-; GCN-NEXT: v_accvgpr_write_b32 a0, v14
-; GCN-NEXT: v_accvgpr_write_b32 a1, v15
-; GCN-NEXT: v_accvgpr_write_b32 a2, v16
-; GCN-NEXT: v_accvgpr_write_b32 a3, v17
-; GCN-NEXT: v_accvgpr_write_b32 a4, v18
-; GCN-NEXT: v_accvgpr_write_b32 a5, v19
-; GCN-NEXT: v_accvgpr_write_b32 a6, v20
-; GCN-NEXT: v_accvgpr_write_b32 a7, v21
-; GCN-NEXT: v_accvgpr_write_b32 a8, v22
-; GCN-NEXT: v_accvgpr_write_b32 a9, v23
-; GCN-NEXT: v_accvgpr_write_b32 a10, v24
-; GCN-NEXT: v_accvgpr_write_b32 a11, v25
-; GCN-NEXT: v_accvgpr_write_b32 a12, v26
-; GCN-NEXT: v_accvgpr_write_b32 a13, v27
-; GCN-NEXT: v_accvgpr_write_b32 a14, v28
-; GCN-NEXT: v_accvgpr_write_b32 a15, v29
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] blgp:4
-; GCN-NEXT: s_nop 15
-; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
-; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
- i32 0, ; cbsz
i32 4, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v6i32_fp4__v8i32_fp8(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v6i32_fp4__v8i32_fp8:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: scratch_load_dword v31, off, s32
-; GCN-NEXT: v_accvgpr_write_b32 a0, v14
-; GCN-NEXT: v_accvgpr_write_b32 a1, v15
-; GCN-NEXT: v_accvgpr_write_b32 a2, v16
-; GCN-NEXT: v_accvgpr_write_b32 a3, v17
-; GCN-NEXT: v_accvgpr_write_b32 a4, v18
-; GCN-NEXT: v_accvgpr_write_b32 a5, v19
-; GCN-NEXT: v_accvgpr_write_b32 a6, v20
-; GCN-NEXT: v_accvgpr_write_b32 a7, v21
-; GCN-NEXT: v_accvgpr_write_b32 a8, v22
-; GCN-NEXT: v_accvgpr_write_b32 a9, v23
-; GCN-NEXT: v_accvgpr_write_b32 a10, v24
-; GCN-NEXT: v_accvgpr_write_b32 a11, v25
-; GCN-NEXT: v_accvgpr_write_b32 a12, v26
-; GCN-NEXT: v_accvgpr_write_b32 a13, v27
-; GCN-NEXT: v_accvgpr_write_b32 a14, v28
-; GCN-NEXT: v_accvgpr_write_b32 a15, v29
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:4
-; GCN-NEXT: s_nop 15
-; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
-; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
- i32 4, ; cbsz
- i32 0, ; blgp
- i32 0, i32 %scale0, i32 0, i32 %scale1)
- ret <16 x float> %result
-}
-
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4:
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4__0_scale(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4__0_scale:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
-; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:4 blgp:4
+; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:4 blgp:4
; SDAG-NEXT: s_nop 11
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
@@ -6263,12 +8562,10 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4(
; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4:
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4__0_scale:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4
-; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8
; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
@@ -6286,7 +8583,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4(
; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:4 blgp:4
+; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:4 blgp:4
; GISEL-NEXT: s_nop 11
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
@@ -6305,54 +8602,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4(
; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
- i32 4, ; cbsz
- i32 4, ; blgp
- i32 0, i32 %scale0, i32 0, i32 %scale1)
- ret <16 x float> %result
-}
-
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4__0_scale(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4__0_scale:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: scratch_load_dword a15, off, s32
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: v_accvgpr_write_b32 a4, v20
-; GCN-NEXT: v_accvgpr_write_b32 a5, v21
-; GCN-NEXT: v_accvgpr_write_b32 a6, v22
-; GCN-NEXT: v_accvgpr_write_b32 a7, v23
-; GCN-NEXT: v_accvgpr_write_b32 a8, v24
-; GCN-NEXT: v_accvgpr_write_b32 a9, v25
-; GCN-NEXT: v_accvgpr_write_b32 a10, v26
-; GCN-NEXT: v_accvgpr_write_b32 a11, v27
-; GCN-NEXT: v_accvgpr_write_b32 a12, v28
-; GCN-NEXT: v_accvgpr_write_b32 a13, v29
-; GCN-NEXT: v_accvgpr_write_b32 a14, v30
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:4 blgp:4
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
-; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 4, ; cbsz
i32 4, ; blgp
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
index 6eb9449069a52..c2b7e51c43bc8 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
@@ -149,19 +149,19 @@ define <4 x float> @test_smfmac_f32_16x16x64_f16__sgpr(<8 x half> inreg %arg0, <
; GISEL-LABEL: test_smfmac_f32_16x16x64_f16__sgpr:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[2:3]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[0:1]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[22:23]
; GISEL-NEXT: v_mov_b32_e32 v0, s24
; GISEL-NEXT: v_mov_b32_e32 v1, s25
; GISEL-NEXT: v_mov_b32_e32 v2, s26
; GISEL-NEXT: v_mov_b32_e32 v3, s27
-; GISEL-NEXT: v_mov_b32_e32 v16, s28
+; GISEL-NEXT: v_mov_b32_e32 v4, s28
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_smfmac_f32_16x16x64_f16 v[0:3], v[12:15], v[4:11], v16
+; GISEL-NEXT: v_smfmac_f32_16x16x64_f16 v[0:3], v[14:17], v[6:13], v4
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <4 x float> %result
@@ -247,168 +247,151 @@ bb:
}
define <16 x float> @test_smfmac_f32_32x32x32_f16(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_f32_32x32x32_f16:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_mov_b32_e32 v0, v12
-; SDAG-NEXT: v_mov_b32_e32 v1, v13
-; SDAG-NEXT: v_mov_b32_e32 v2, v14
-; SDAG-NEXT: v_mov_b32_e32 v3, v15
-; SDAG-NEXT: v_mov_b32_e32 v4, v16
-; SDAG-NEXT: v_mov_b32_e32 v5, v17
-; SDAG-NEXT: v_mov_b32_e32 v6, v18
-; SDAG-NEXT: v_mov_b32_e32 v7, v19
-; SDAG-NEXT: v_mov_b32_e32 v8, v20
-; SDAG-NEXT: v_mov_b32_e32 v9, v21
-; SDAG-NEXT: v_mov_b32_e32 v10, v22
-; SDAG-NEXT: v_mov_b32_e32 v11, v23
-; SDAG-NEXT: v_mov_b32_e32 v12, v24
-; SDAG-NEXT: v_mov_b32_e32 v13, v25
-; SDAG-NEXT: v_mov_b32_e32 v14, v26
-; SDAG-NEXT: v_mov_b32_e32 v15, v27
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_f32_32x32x32_f16:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b32_e32 v48, v0
-; GISEL-NEXT: v_mov_b32_e32 v49, v1
-; GISEL-NEXT: v_mov_b32_e32 v50, v2
-; GISEL-NEXT: v_mov_b32_e32 v51, v3
-; GISEL-NEXT: v_mov_b32_e32 v30, v4
-; GISEL-NEXT: v_mov_b32_e32 v31, v5
-; GISEL-NEXT: v_mov_b32_e32 v32, v6
-; GISEL-NEXT: v_mov_b32_e32 v33, v7
-; GISEL-NEXT: v_mov_b32_e32 v34, v8
-; GISEL-NEXT: v_mov_b32_e32 v35, v9
-; GISEL-NEXT: v_mov_b32_e32 v36, v10
-; GISEL-NEXT: v_mov_b32_e32 v37, v11
-; GISEL-NEXT: v_mov_b32_e32 v0, v12
-; GISEL-NEXT: v_mov_b32_e32 v1, v13
-; GISEL-NEXT: v_mov_b32_e32 v2, v14
-; GISEL-NEXT: v_mov_b32_e32 v3, v15
-; GISEL-NEXT: v_mov_b32_e32 v4, v16
-; GISEL-NEXT: v_mov_b32_e32 v5, v17
-; GISEL-NEXT: v_mov_b32_e32 v6, v18
-; GISEL-NEXT: v_mov_b32_e32 v7, v19
-; GISEL-NEXT: v_mov_b32_e32 v8, v20
-; GISEL-NEXT: v_mov_b32_e32 v9, v21
-; GISEL-NEXT: v_mov_b32_e32 v10, v22
-; GISEL-NEXT: v_mov_b32_e32 v11, v23
-; GISEL-NEXT: v_mov_b32_e32 v12, v24
-; GISEL-NEXT: v_mov_b32_e32 v13, v25
-; GISEL-NEXT: v_mov_b32_e32 v14, v26
-; GISEL-NEXT: v_mov_b32_e32 v15, v27
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[48:51], v[30:37], v28
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_smfmac_f32_32x32x32_f16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_mov_b32_e32 v0, v12
+; GCN-NEXT: v_mov_b32_e32 v1, v13
+; GCN-NEXT: v_mov_b32_e32 v2, v14
+; GCN-NEXT: v_mov_b32_e32 v3, v15
+; GCN-NEXT: v_mov_b32_e32 v4, v16
+; GCN-NEXT: v_mov_b32_e32 v5, v17
+; GCN-NEXT: v_mov_b32_e32 v6, v18
+; GCN-NEXT: v_mov_b32_e32 v7, v19
+; GCN-NEXT: v_mov_b32_e32 v8, v20
+; GCN-NEXT: v_mov_b32_e32 v9, v21
+; GCN-NEXT: v_mov_b32_e32 v10, v22
+; GCN-NEXT: v_mov_b32_e32 v11, v23
+; GCN-NEXT: v_mov_b32_e32 v12, v24
+; GCN-NEXT: v_mov_b32_e32 v13, v25
+; GCN-NEXT: v_mov_b32_e32 v14, v26
+; GCN-NEXT: v_mov_b32_e32 v15, v27
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.f16(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <16 x float> %result
}
define <16 x float> @test_smfmac_f32_32x32x32_f16__flags0(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_f32_32x32x32_f16__flags0:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_mov_b32_e32 v0, v12
-; SDAG-NEXT: v_mov_b32_e32 v1, v13
-; SDAG-NEXT: v_mov_b32_e32 v2, v14
-; SDAG-NEXT: v_mov_b32_e32 v3, v15
-; SDAG-NEXT: v_mov_b32_e32 v4, v16
-; SDAG-NEXT: v_mov_b32_e32 v5, v17
-; SDAG-NEXT: v_mov_b32_e32 v6, v18
-; SDAG-NEXT: v_mov_b32_e32 v7, v19
-; SDAG-NEXT: v_mov_b32_e32 v8, v20
-; SDAG-NEXT: v_mov_b32_e32 v9, v21
-; SDAG-NEXT: v_mov_b32_e32 v10, v22
-; SDAG-NEXT: v_mov_b32_e32 v11, v23
-; SDAG-NEXT: v_mov_b32_e32 v12, v24
-; SDAG-NEXT: v_mov_b32_e32 v13, v25
-; SDAG-NEXT: v_mov_b32_e32 v14, v26
-; SDAG-NEXT: v_mov_b32_e32 v15, v27
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_f32_32x32x32_f16__flags0:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b32_e32 v48, v0
-; GISEL-NEXT: v_mov_b32_e32 v49, v1
-; GISEL-NEXT: v_mov_b32_e32 v50, v2
-; GISEL-NEXT: v_mov_b32_e32 v51, v3
-; GISEL-NEXT: v_mov_b32_e32 v30, v4
-; GISEL-NEXT: v_mov_b32_e32 v31, v5
-; GISEL-NEXT: v_mov_b32_e32 v32, v6
-; GISEL-NEXT: v_mov_b32_e32 v33, v7
-; GISEL-NEXT: v_mov_b32_e32 v34, v8
-; GISEL-NEXT: v_mov_b32_e32 v35, v9
-; GISEL-NEXT: v_mov_b32_e32 v36, v10
-; GISEL-NEXT: v_mov_b32_e32 v37, v11
-; GISEL-NEXT: v_mov_b32_e32 v0, v12
-; GISEL-NEXT: v_mov_b32_e32 v1, v13
-; GISEL-NEXT: v_mov_b32_e32 v2, v14
-; GISEL-NEXT: v_mov_b32_e32 v3, v15
-; GISEL-NEXT: v_mov_b32_e32 v4, v16
-; GISEL-NEXT: v_mov_b32_e32 v5, v17
-; GISEL-NEXT: v_mov_b32_e32 v6, v18
-; GISEL-NEXT: v_mov_b32_e32 v7, v19
-; GISEL-NEXT: v_mov_b32_e32 v8, v20
-; GISEL-NEXT: v_mov_b32_e32 v9, v21
-; GISEL-NEXT: v_mov_b32_e32 v10, v22
-; GISEL-NEXT: v_mov_b32_e32 v11, v23
-; GISEL-NEXT: v_mov_b32_e32 v12, v24
-; GISEL-NEXT: v_mov_b32_e32 v13, v25
-; GISEL-NEXT: v_mov_b32_e32 v14, v26
-; GISEL-NEXT: v_mov_b32_e32 v15, v27
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_smfmac_f32_32x32x32_f16__flags0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_mov_b32_e32 v0, v12
+; GCN-NEXT: v_mov_b32_e32 v1, v13
+; GCN-NEXT: v_mov_b32_e32 v2, v14
+; GCN-NEXT: v_mov_b32_e32 v3, v15
+; GCN-NEXT: v_mov_b32_e32 v4, v16
+; GCN-NEXT: v_mov_b32_e32 v5, v17
+; GCN-NEXT: v_mov_b32_e32 v6, v18
+; GCN-NEXT: v_mov_b32_e32 v7, v19
+; GCN-NEXT: v_mov_b32_e32 v8, v20
+; GCN-NEXT: v_mov_b32_e32 v9, v21
+; GCN-NEXT: v_mov_b32_e32 v10, v22
+; GCN-NEXT: v_mov_b32_e32 v11, v23
+; GCN-NEXT: v_mov_b32_e32 v12, v24
+; GCN-NEXT: v_mov_b32_e32 v13, v25
+; GCN-NEXT: v_mov_b32_e32 v14, v26
+; GCN-NEXT: v_mov_b32_e32 v15, v27
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.f16(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
ret <16 x float> %result
}
define <16 x float> @test_smfmac_f32_32x32x32_f16__flags1(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_f32_32x32x32_f16__flags1:
+; GCN-LABEL: test_smfmac_f32_32x32x32_f16__flags1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_mov_b32_e32 v0, v12
+; GCN-NEXT: v_mov_b32_e32 v1, v13
+; GCN-NEXT: v_mov_b32_e32 v2, v14
+; GCN-NEXT: v_mov_b32_e32 v3, v15
+; GCN-NEXT: v_mov_b32_e32 v4, v16
+; GCN-NEXT: v_mov_b32_e32 v5, v17
+; GCN-NEXT: v_mov_b32_e32 v6, v18
+; GCN-NEXT: v_mov_b32_e32 v7, v19
+; GCN-NEXT: v_mov_b32_e32 v8, v20
+; GCN-NEXT: v_mov_b32_e32 v9, v21
+; GCN-NEXT: v_mov_b32_e32 v10, v22
+; GCN-NEXT: v_mov_b32_e32 v11, v23
+; GCN-NEXT: v_mov_b32_e32 v12, v24
+; GCN-NEXT: v_mov_b32_e32 v13, v25
+; GCN-NEXT: v_mov_b32_e32 v14, v26
+; GCN-NEXT: v_mov_b32_e32 v15, v27
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.f16(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
+ ret <16 x float> %result
+}
+
+define <16 x float> @test_smfmac_f32_32x32x32_f16__sgpr(<8 x half> inreg %arg0, <16 x half> inreg %arg1, <16 x float> inreg %arg2, i32 inreg %arg3) {
+; SDAG-LABEL: test_smfmac_f32_32x32x32_f16__sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_mov_b32_e32 v0, v12
-; SDAG-NEXT: v_mov_b32_e32 v1, v13
-; SDAG-NEXT: v_mov_b32_e32 v2, v14
-; SDAG-NEXT: v_mov_b32_e32 v3, v15
-; SDAG-NEXT: v_mov_b32_e32 v4, v16
-; SDAG-NEXT: v_mov_b32_e32 v5, v17
-; SDAG-NEXT: v_mov_b32_e32 v6, v18
-; SDAG-NEXT: v_mov_b32_e32 v7, v19
-; SDAG-NEXT: v_mov_b32_e32 v8, v20
-; SDAG-NEXT: v_mov_b32_e32 v9, v21
-; SDAG-NEXT: v_mov_b32_e32 v10, v22
-; SDAG-NEXT: v_mov_b32_e32 v11, v23
-; SDAG-NEXT: v_mov_b32_e32 v12, v24
-; SDAG-NEXT: v_mov_b32_e32 v13, v25
-; SDAG-NEXT: v_mov_b32_e32 v14, v26
-; SDAG-NEXT: v_mov_b32_e32 v15, v27
+; SDAG-NEXT: v_mov_b32_e32 v26, s0
+; SDAG-NEXT: v_mov_b32_e32 v27, s1
+; SDAG-NEXT: v_mov_b32_e32 v28, s2
+; SDAG-NEXT: v_mov_b32_e32 v29, s3
+; SDAG-NEXT: v_mov_b32_e32 v16, v10
+; SDAG-NEXT: v_mov_b32_e32 v15, v9
+; SDAG-NEXT: v_mov_b32_e32 v14, v8
+; SDAG-NEXT: v_mov_b32_e32 v13, v7
+; SDAG-NEXT: v_mov_b32_e32 v12, v6
+; SDAG-NEXT: v_mov_b32_e32 v11, v5
+; SDAG-NEXT: v_mov_b32_e32 v10, v4
+; SDAG-NEXT: v_mov_b32_e32 v9, v3
+; SDAG-NEXT: v_mov_b32_e32 v8, v2
+; SDAG-NEXT: v_mov_b32_e32 v7, v1
+; SDAG-NEXT: v_mov_b32_e32 v6, v0
+; SDAG-NEXT: v_mov_b32_e32 v0, s24
+; SDAG-NEXT: v_mov_b32_e32 v1, s25
+; SDAG-NEXT: v_mov_b32_e32 v2, s26
+; SDAG-NEXT: v_mov_b32_e32 v3, s27
+; SDAG-NEXT: v_mov_b32_e32 v4, s28
+; SDAG-NEXT: v_mov_b32_e32 v5, s29
+; SDAG-NEXT: v_mov_b32_e32 v18, s16
+; SDAG-NEXT: v_mov_b32_e32 v19, s17
+; SDAG-NEXT: v_mov_b32_e32 v20, s18
+; SDAG-NEXT: v_mov_b32_e32 v21, s19
+; SDAG-NEXT: v_mov_b32_e32 v22, s20
+; SDAG-NEXT: v_mov_b32_e32 v23, s21
+; SDAG-NEXT: v_mov_b32_e32 v24, s22
+; SDAG-NEXT: v_mov_b32_e32 v25, s23
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[26:29], v[18:25], v16
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-LABEL: test_smfmac_f32_32x32x32_f16__flags1:
+; GISEL-LABEL: test_smfmac_f32_32x32x32_f16__sgpr:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b32_e32 v48, v0
-; GISEL-NEXT: v_mov_b32_e32 v49, v1
-; GISEL-NEXT: v_mov_b32_e32 v50, v2
-; GISEL-NEXT: v_mov_b32_e32 v51, v3
-; GISEL-NEXT: v_mov_b32_e32 v30, v4
-; GISEL-NEXT: v_mov_b32_e32 v31, v5
-; GISEL-NEXT: v_mov_b32_e32 v32, v6
-; GISEL-NEXT: v_mov_b32_e32 v33, v7
-; GISEL-NEXT: v_mov_b32_e32 v34, v8
-; GISEL-NEXT: v_mov_b32_e32 v35, v9
-; GISEL-NEXT: v_mov_b32_e32 v36, v10
-; GISEL-NEXT: v_mov_b32_e32 v37, v11
+; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[2:3]
+; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[0:1]
+; GISEL-NEXT: v_mov_b32_e32 v18, v0
+; GISEL-NEXT: v_mov_b32_e32 v19, v1
+; GISEL-NEXT: v_mov_b32_e32 v20, v2
+; GISEL-NEXT: v_mov_b32_e32 v21, v3
+; GISEL-NEXT: v_mov_b32_e32 v22, v4
+; GISEL-NEXT: v_mov_b32_e32 v23, v5
+; GISEL-NEXT: v_mov_b32_e32 v24, v6
+; GISEL-NEXT: v_mov_b32_e32 v25, v7
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[16:17]
+; GISEL-NEXT: v_mov_b32_e32 v26, v8
+; GISEL-NEXT: v_mov_b32_e32 v27, v9
+; GISEL-NEXT: v_mov_b32_e32 v12, s24
+; GISEL-NEXT: v_mov_b32_e32 v13, s25
+; GISEL-NEXT: v_mov_b32_e32 v14, s26
+; GISEL-NEXT: v_mov_b32_e32 v15, s27
+; GISEL-NEXT: v_mov_b32_e32 v16, s28
+; GISEL-NEXT: v_mov_b32_e32 v17, s29
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[22:23]
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[28:31], v[0:7], v10
+; GISEL-NEXT: s_nop 11
; GISEL-NEXT: v_mov_b32_e32 v0, v12
; GISEL-NEXT: v_mov_b32_e32 v1, v13
; GISEL-NEXT: v_mov_b32_e32 v2, v14
@@ -425,104 +408,6 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__flags1(<8 x half> %arg0, <16
; GISEL-NEXT: v_mov_b32_e32 v13, v25
; GISEL-NEXT: v_mov_b32_e32 v14, v26
; GISEL-NEXT: v_mov_b32_e32 v15, v27
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1
-; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.f16(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
- ret <16 x float> %result
-}
-
-define <16 x float> @test_smfmac_f32_32x32x32_f16__sgpr(<8 x half> inreg %arg0, <16 x half> inreg %arg1, <16 x float> inreg %arg2, i32 inreg %arg3) {
-; SDAG-LABEL: test_smfmac_f32_32x32x32_f16__sgpr:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v36, s0
-; SDAG-NEXT: v_mov_b32_e32 v37, s1
-; SDAG-NEXT: v_mov_b32_e32 v38, s2
-; SDAG-NEXT: v_mov_b32_e32 v39, s3
-; SDAG-NEXT: v_mov_b32_e32 v13, s25
-; SDAG-NEXT: v_mov_b32_e32 v14, s26
-; SDAG-NEXT: v_mov_b32_e32 v15, s27
-; SDAG-NEXT: v_mov_b32_e32 v16, s28
-; SDAG-NEXT: v_mov_b32_e32 v17, s29
-; SDAG-NEXT: v_mov_b32_e32 v28, s16
-; SDAG-NEXT: v_mov_b32_e32 v29, s17
-; SDAG-NEXT: v_mov_b32_e32 v30, s18
-; SDAG-NEXT: v_mov_b32_e32 v31, s19
-; SDAG-NEXT: v_mov_b32_e32 v32, s20
-; SDAG-NEXT: v_mov_b32_e32 v33, s21
-; SDAG-NEXT: v_mov_b32_e32 v34, s22
-; SDAG-NEXT: v_mov_b32_e32 v35, s23
-; SDAG-NEXT: v_mov_b32_e32 v12, s24
-; SDAG-NEXT: v_mov_b32_e32 v18, v0
-; SDAG-NEXT: v_mov_b32_e32 v19, v1
-; SDAG-NEXT: v_mov_b32_e32 v20, v2
-; SDAG-NEXT: v_mov_b32_e32 v21, v3
-; SDAG-NEXT: v_mov_b32_e32 v22, v4
-; SDAG-NEXT: v_mov_b32_e32 v23, v5
-; SDAG-NEXT: v_mov_b32_e32 v24, v6
-; SDAG-NEXT: v_mov_b32_e32 v25, v7
-; SDAG-NEXT: v_mov_b32_e32 v26, v8
-; SDAG-NEXT: v_mov_b32_e32 v27, v9
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[36:39], v[28:35], v10
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_mov_b32_e32 v0, v12
-; SDAG-NEXT: v_mov_b32_e32 v1, v13
-; SDAG-NEXT: v_mov_b32_e32 v2, v14
-; SDAG-NEXT: v_mov_b32_e32 v3, v15
-; SDAG-NEXT: v_mov_b32_e32 v4, v16
-; SDAG-NEXT: v_mov_b32_e32 v5, v17
-; SDAG-NEXT: v_mov_b32_e32 v6, v18
-; SDAG-NEXT: v_mov_b32_e32 v7, v19
-; SDAG-NEXT: v_mov_b32_e32 v8, v20
-; SDAG-NEXT: v_mov_b32_e32 v9, v21
-; SDAG-NEXT: v_mov_b32_e32 v10, v22
-; SDAG-NEXT: v_mov_b32_e32 v11, v23
-; SDAG-NEXT: v_mov_b32_e32 v12, v24
-; SDAG-NEXT: v_mov_b32_e32 v13, v25
-; SDAG-NEXT: v_mov_b32_e32 v14, v26
-; SDAG-NEXT: v_mov_b32_e32 v15, v27
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_f32_32x32x32_f16__sgpr:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[2:3]
-; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[0:1]
-; GISEL-NEXT: v_mov_b32_e32 v11, v0
-; GISEL-NEXT: v_mov_b32_e32 v12, v1
-; GISEL-NEXT: v_mov_b32_e32 v13, v2
-; GISEL-NEXT: v_mov_b32_e32 v14, v3
-; GISEL-NEXT: v_mov_b32_e32 v15, v4
-; GISEL-NEXT: v_mov_b32_e32 v16, v5
-; GISEL-NEXT: v_mov_b32_e32 v17, v6
-; GISEL-NEXT: v_mov_b32_e32 v18, v7
-; GISEL-NEXT: v_mov_b32_e32 v19, v8
-; GISEL-NEXT: v_mov_b32_e32 v20, v9
-; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[22:23]
-; GISEL-NEXT: v_mov_b32_e32 v21, v10
-; GISEL-NEXT: v_mov_b32_e32 v0, s24
-; GISEL-NEXT: v_mov_b32_e32 v1, s25
-; GISEL-NEXT: v_mov_b32_e32 v2, s26
-; GISEL-NEXT: v_mov_b32_e32 v3, s27
-; GISEL-NEXT: v_mov_b32_e32 v4, s28
-; GISEL-NEXT: v_mov_b32_e32 v5, s29
-; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17]
-; GISEL-NEXT: v_mov_b32_e32 v6, v11
-; GISEL-NEXT: v_mov_b32_e32 v7, v12
-; GISEL-NEXT: v_mov_b32_e32 v8, v13
-; GISEL-NEXT: v_mov_b32_e32 v9, v14
-; GISEL-NEXT: v_mov_b32_e32 v10, v15
-; GISEL-NEXT: v_mov_b32_e32 v11, v16
-; GISEL-NEXT: v_mov_b32_e32 v12, v17
-; GISEL-NEXT: v_mov_b32_e32 v13, v18
-; GISEL-NEXT: v_mov_b32_e32 v14, v19
-; GISEL-NEXT: v_mov_b32_e32 v15, v20
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[30:33], v[22:29], v21
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.f16(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <16 x float> %result
@@ -779,53 +664,37 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__sgpr(<8 x bfloat> inreg %arg
; GCN-LABEL: test_smfmac_f32_32x32x32_bf16__sgpr:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v36, s0
-; GCN-NEXT: v_mov_b32_e32 v37, s1
-; GCN-NEXT: v_mov_b32_e32 v38, s2
-; GCN-NEXT: v_mov_b32_e32 v39, s3
-; GCN-NEXT: v_mov_b32_e32 v13, s25
-; GCN-NEXT: v_mov_b32_e32 v14, s26
-; GCN-NEXT: v_mov_b32_e32 v15, s27
-; GCN-NEXT: v_mov_b32_e32 v16, s28
-; GCN-NEXT: v_mov_b32_e32 v17, s29
-; GCN-NEXT: v_mov_b32_e32 v28, s16
-; GCN-NEXT: v_mov_b32_e32 v29, s17
-; GCN-NEXT: v_mov_b32_e32 v30, s18
-; GCN-NEXT: v_mov_b32_e32 v31, s19
-; GCN-NEXT: v_mov_b32_e32 v32, s20
-; GCN-NEXT: v_mov_b32_e32 v33, s21
-; GCN-NEXT: v_mov_b32_e32 v34, s22
-; GCN-NEXT: v_mov_b32_e32 v35, s23
-; GCN-NEXT: v_mov_b32_e32 v12, s24
-; GCN-NEXT: v_mov_b32_e32 v18, v0
-; GCN-NEXT: v_mov_b32_e32 v19, v1
-; GCN-NEXT: v_mov_b32_e32 v20, v2
-; GCN-NEXT: v_mov_b32_e32 v21, v3
-; GCN-NEXT: v_mov_b32_e32 v22, v4
-; GCN-NEXT: v_mov_b32_e32 v23, v5
-; GCN-NEXT: v_mov_b32_e32 v24, v6
-; GCN-NEXT: v_mov_b32_e32 v25, v7
-; GCN-NEXT: v_mov_b32_e32 v26, v8
-; GCN-NEXT: v_mov_b32_e32 v27, v9
+; GCN-NEXT: v_mov_b32_e32 v26, s0
+; GCN-NEXT: v_mov_b32_e32 v27, s1
+; GCN-NEXT: v_mov_b32_e32 v28, s2
+; GCN-NEXT: v_mov_b32_e32 v29, s3
+; GCN-NEXT: v_mov_b32_e32 v16, v10
+; GCN-NEXT: v_mov_b32_e32 v15, v9
+; GCN-NEXT: v_mov_b32_e32 v14, v8
+; GCN-NEXT: v_mov_b32_e32 v13, v7
+; GCN-NEXT: v_mov_b32_e32 v12, v6
+; GCN-NEXT: v_mov_b32_e32 v11, v5
+; GCN-NEXT: v_mov_b32_e32 v10, v4
+; GCN-NEXT: v_mov_b32_e32 v9, v3
+; GCN-NEXT: v_mov_b32_e32 v8, v2
+; GCN-NEXT: v_mov_b32_e32 v7, v1
+; GCN-NEXT: v_mov_b32_e32 v6, v0
+; GCN-NEXT: v_mov_b32_e32 v0, s24
+; GCN-NEXT: v_mov_b32_e32 v1, s25
+; GCN-NEXT: v_mov_b32_e32 v2, s26
+; GCN-NEXT: v_mov_b32_e32 v3, s27
+; GCN-NEXT: v_mov_b32_e32 v4, s28
+; GCN-NEXT: v_mov_b32_e32 v5, s29
+; GCN-NEXT: v_mov_b32_e32 v18, s16
+; GCN-NEXT: v_mov_b32_e32 v19, s17
+; GCN-NEXT: v_mov_b32_e32 v20, s18
+; GCN-NEXT: v_mov_b32_e32 v21, s19
+; GCN-NEXT: v_mov_b32_e32 v22, s20
+; GCN-NEXT: v_mov_b32_e32 v23, s21
+; GCN-NEXT: v_mov_b32_e32 v24, s22
+; GCN-NEXT: v_mov_b32_e32 v25, s23
; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 v[12:27], v[36:39], v[28:35], v10
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_mov_b32_e32 v0, v12
-; GCN-NEXT: v_mov_b32_e32 v1, v13
-; GCN-NEXT: v_mov_b32_e32 v2, v14
-; GCN-NEXT: v_mov_b32_e32 v3, v15
-; GCN-NEXT: v_mov_b32_e32 v4, v16
-; GCN-NEXT: v_mov_b32_e32 v5, v17
-; GCN-NEXT: v_mov_b32_e32 v6, v18
-; GCN-NEXT: v_mov_b32_e32 v7, v19
-; GCN-NEXT: v_mov_b32_e32 v8, v20
-; GCN-NEXT: v_mov_b32_e32 v9, v21
-; GCN-NEXT: v_mov_b32_e32 v10, v22
-; GCN-NEXT: v_mov_b32_e32 v11, v23
-; GCN-NEXT: v_mov_b32_e32 v12, v24
-; GCN-NEXT: v_mov_b32_e32 v13, v25
-; GCN-NEXT: v_mov_b32_e32 v14, v26
-; GCN-NEXT: v_mov_b32_e32 v15, v27
+; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 v[0:15], v[26:29], v[18:25], v16
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <16 x float> %result
@@ -982,19 +851,19 @@ define <4 x i32> @test_smfmac_i32_16x16x128_i8__sgpr(<4 x i32> inreg %arg0, <8 x
; GISEL-LABEL: test_smfmac_i32_16x16x128_i8__sgpr:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[2:3]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[0:1]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[22:23]
; GISEL-NEXT: v_mov_b32_e32 v0, s24
; GISEL-NEXT: v_mov_b32_e32 v1, s25
; GISEL-NEXT: v_mov_b32_e32 v2, s26
; GISEL-NEXT: v_mov_b32_e32 v3, s27
-; GISEL-NEXT: v_mov_b32_e32 v16, s28
+; GISEL-NEXT: v_mov_b32_e32 v4, s28
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_smfmac_i32_16x16x128_i8 v[0:3], v[12:15], v[4:11], v16
+; GISEL-NEXT: v_smfmac_i32_16x16x128_i8 v[0:3], v[14:17], v[6:13], v4
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x128.i8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <4 x i32> %result
@@ -1086,44 +955,151 @@ bb:
}
define <16 x i32> @test_smfmac_i32_32x32x64_i8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_i32_32x32x64_i8:
+; GCN-LABEL: test_smfmac_i32_32x32x64_i8:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_mov_b32_e32 v0, v12
+; GCN-NEXT: v_mov_b32_e32 v1, v13
+; GCN-NEXT: v_mov_b32_e32 v2, v14
+; GCN-NEXT: v_mov_b32_e32 v3, v15
+; GCN-NEXT: v_mov_b32_e32 v4, v16
+; GCN-NEXT: v_mov_b32_e32 v5, v17
+; GCN-NEXT: v_mov_b32_e32 v6, v18
+; GCN-NEXT: v_mov_b32_e32 v7, v19
+; GCN-NEXT: v_mov_b32_e32 v8, v20
+; GCN-NEXT: v_mov_b32_e32 v9, v21
+; GCN-NEXT: v_mov_b32_e32 v10, v22
+; GCN-NEXT: v_mov_b32_e32 v11, v23
+; GCN-NEXT: v_mov_b32_e32 v12, v24
+; GCN-NEXT: v_mov_b32_e32 v13, v25
+; GCN-NEXT: v_mov_b32_e32 v14, v26
+; GCN-NEXT: v_mov_b32_e32 v15, v27
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
+ ret <16 x i32> %result
+}
+
+define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3) {
+; GCN-LABEL: test_smfmac_i32_32x32x64_i8__flags0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_mov_b32_e32 v0, v12
+; GCN-NEXT: v_mov_b32_e32 v1, v13
+; GCN-NEXT: v_mov_b32_e32 v2, v14
+; GCN-NEXT: v_mov_b32_e32 v3, v15
+; GCN-NEXT: v_mov_b32_e32 v4, v16
+; GCN-NEXT: v_mov_b32_e32 v5, v17
+; GCN-NEXT: v_mov_b32_e32 v6, v18
+; GCN-NEXT: v_mov_b32_e32 v7, v19
+; GCN-NEXT: v_mov_b32_e32 v8, v20
+; GCN-NEXT: v_mov_b32_e32 v9, v21
+; GCN-NEXT: v_mov_b32_e32 v10, v22
+; GCN-NEXT: v_mov_b32_e32 v11, v23
+; GCN-NEXT: v_mov_b32_e32 v12, v24
+; GCN-NEXT: v_mov_b32_e32 v13, v25
+; GCN-NEXT: v_mov_b32_e32 v14, v26
+; GCN-NEXT: v_mov_b32_e32 v15, v27
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
+ ret <16 x i32> %result
+}
+
+define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3) {
+; GCN-LABEL: test_smfmac_i32_32x32x64_i8__flags1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_mov_b32_e32 v0, v12
+; GCN-NEXT: v_mov_b32_e32 v1, v13
+; GCN-NEXT: v_mov_b32_e32 v2, v14
+; GCN-NEXT: v_mov_b32_e32 v3, v15
+; GCN-NEXT: v_mov_b32_e32 v4, v16
+; GCN-NEXT: v_mov_b32_e32 v5, v17
+; GCN-NEXT: v_mov_b32_e32 v6, v18
+; GCN-NEXT: v_mov_b32_e32 v7, v19
+; GCN-NEXT: v_mov_b32_e32 v8, v20
+; GCN-NEXT: v_mov_b32_e32 v9, v21
+; GCN-NEXT: v_mov_b32_e32 v10, v22
+; GCN-NEXT: v_mov_b32_e32 v11, v23
+; GCN-NEXT: v_mov_b32_e32 v12, v24
+; GCN-NEXT: v_mov_b32_e32 v13, v25
+; GCN-NEXT: v_mov_b32_e32 v14, v26
+; GCN-NEXT: v_mov_b32_e32 v15, v27
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
+ ret <16 x i32> %result
+}
+
+define <16 x i32> @test_smfmac_i32_32x32x64_i8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x i32> inreg %arg2, i32 inreg %arg3) {
+; SDAG-LABEL: test_smfmac_i32_32x32x64_i8__sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_mov_b32_e32 v0, v12
-; SDAG-NEXT: v_mov_b32_e32 v1, v13
-; SDAG-NEXT: v_mov_b32_e32 v2, v14
-; SDAG-NEXT: v_mov_b32_e32 v3, v15
-; SDAG-NEXT: v_mov_b32_e32 v4, v16
-; SDAG-NEXT: v_mov_b32_e32 v5, v17
-; SDAG-NEXT: v_mov_b32_e32 v6, v18
-; SDAG-NEXT: v_mov_b32_e32 v7, v19
-; SDAG-NEXT: v_mov_b32_e32 v8, v20
-; SDAG-NEXT: v_mov_b32_e32 v9, v21
-; SDAG-NEXT: v_mov_b32_e32 v10, v22
-; SDAG-NEXT: v_mov_b32_e32 v11, v23
-; SDAG-NEXT: v_mov_b32_e32 v12, v24
-; SDAG-NEXT: v_mov_b32_e32 v13, v25
-; SDAG-NEXT: v_mov_b32_e32 v14, v26
-; SDAG-NEXT: v_mov_b32_e32 v15, v27
+; SDAG-NEXT: v_mov_b32_e32 v26, s0
+; SDAG-NEXT: v_mov_b32_e32 v27, s1
+; SDAG-NEXT: v_mov_b32_e32 v28, s2
+; SDAG-NEXT: v_mov_b32_e32 v29, s3
+; SDAG-NEXT: v_mov_b32_e32 v16, v10
+; SDAG-NEXT: v_mov_b32_e32 v15, v9
+; SDAG-NEXT: v_mov_b32_e32 v14, v8
+; SDAG-NEXT: v_mov_b32_e32 v13, v7
+; SDAG-NEXT: v_mov_b32_e32 v12, v6
+; SDAG-NEXT: v_mov_b32_e32 v11, v5
+; SDAG-NEXT: v_mov_b32_e32 v10, v4
+; SDAG-NEXT: v_mov_b32_e32 v9, v3
+; SDAG-NEXT: v_mov_b32_e32 v8, v2
+; SDAG-NEXT: v_mov_b32_e32 v7, v1
+; SDAG-NEXT: v_mov_b32_e32 v6, v0
+; SDAG-NEXT: v_mov_b32_e32 v0, s24
+; SDAG-NEXT: v_mov_b32_e32 v1, s25
+; SDAG-NEXT: v_mov_b32_e32 v2, s26
+; SDAG-NEXT: v_mov_b32_e32 v3, s27
+; SDAG-NEXT: v_mov_b32_e32 v4, s28
+; SDAG-NEXT: v_mov_b32_e32 v5, s29
+; SDAG-NEXT: v_mov_b32_e32 v18, s16
+; SDAG-NEXT: v_mov_b32_e32 v19, s17
+; SDAG-NEXT: v_mov_b32_e32 v20, s18
+; SDAG-NEXT: v_mov_b32_e32 v21, s19
+; SDAG-NEXT: v_mov_b32_e32 v22, s20
+; SDAG-NEXT: v_mov_b32_e32 v23, s21
+; SDAG-NEXT: v_mov_b32_e32 v24, s22
+; SDAG-NEXT: v_mov_b32_e32 v25, s23
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[26:29], v[18:25], v16
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-LABEL: test_smfmac_i32_32x32x64_i8:
+; GISEL-LABEL: test_smfmac_i32_32x32x64_i8__sgpr:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b32_e32 v48, v0
-; GISEL-NEXT: v_mov_b32_e32 v49, v1
-; GISEL-NEXT: v_mov_b32_e32 v50, v2
-; GISEL-NEXT: v_mov_b32_e32 v51, v3
-; GISEL-NEXT: v_mov_b32_e32 v30, v4
-; GISEL-NEXT: v_mov_b32_e32 v31, v5
-; GISEL-NEXT: v_mov_b32_e32 v32, v6
-; GISEL-NEXT: v_mov_b32_e32 v33, v7
-; GISEL-NEXT: v_mov_b32_e32 v34, v8
-; GISEL-NEXT: v_mov_b32_e32 v35, v9
-; GISEL-NEXT: v_mov_b32_e32 v36, v10
-; GISEL-NEXT: v_mov_b32_e32 v37, v11
+; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[2:3]
+; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[0:1]
+; GISEL-NEXT: v_mov_b32_e32 v18, v0
+; GISEL-NEXT: v_mov_b32_e32 v19, v1
+; GISEL-NEXT: v_mov_b32_e32 v20, v2
+; GISEL-NEXT: v_mov_b32_e32 v21, v3
+; GISEL-NEXT: v_mov_b32_e32 v22, v4
+; GISEL-NEXT: v_mov_b32_e32 v23, v5
+; GISEL-NEXT: v_mov_b32_e32 v24, v6
+; GISEL-NEXT: v_mov_b32_e32 v25, v7
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[16:17]
+; GISEL-NEXT: v_mov_b32_e32 v26, v8
+; GISEL-NEXT: v_mov_b32_e32 v27, v9
+; GISEL-NEXT: v_mov_b32_e32 v12, s24
+; GISEL-NEXT: v_mov_b32_e32 v13, s25
+; GISEL-NEXT: v_mov_b32_e32 v14, s26
+; GISEL-NEXT: v_mov_b32_e32 v15, s27
+; GISEL-NEXT: v_mov_b32_e32 v16, s28
+; GISEL-NEXT: v_mov_b32_e32 v17, s29
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[22:23]
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[28:31], v[0:7], v10
+; GISEL-NEXT: s_nop 11
; GISEL-NEXT: v_mov_b32_e32 v0, v12
; GISEL-NEXT: v_mov_b32_e32 v1, v13
; GISEL-NEXT: v_mov_b32_e32 v2, v14
@@ -1140,228 +1116,6 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8(<4 x i32> %arg0, <8 x i32> %arg1,
; GISEL-NEXT: v_mov_b32_e32 v13, v25
; GISEL-NEXT: v_mov_b32_e32 v14, v26
; GISEL-NEXT: v_mov_b32_e32 v15, v27
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[48:51], v[30:37], v28
-; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
- ret <16 x i32> %result
-}
-
-define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_i32_32x32x64_i8__flags0:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_mov_b32_e32 v0, v12
-; SDAG-NEXT: v_mov_b32_e32 v1, v13
-; SDAG-NEXT: v_mov_b32_e32 v2, v14
-; SDAG-NEXT: v_mov_b32_e32 v3, v15
-; SDAG-NEXT: v_mov_b32_e32 v4, v16
-; SDAG-NEXT: v_mov_b32_e32 v5, v17
-; SDAG-NEXT: v_mov_b32_e32 v6, v18
-; SDAG-NEXT: v_mov_b32_e32 v7, v19
-; SDAG-NEXT: v_mov_b32_e32 v8, v20
-; SDAG-NEXT: v_mov_b32_e32 v9, v21
-; SDAG-NEXT: v_mov_b32_e32 v10, v22
-; SDAG-NEXT: v_mov_b32_e32 v11, v23
-; SDAG-NEXT: v_mov_b32_e32 v12, v24
-; SDAG-NEXT: v_mov_b32_e32 v13, v25
-; SDAG-NEXT: v_mov_b32_e32 v14, v26
-; SDAG-NEXT: v_mov_b32_e32 v15, v27
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_i32_32x32x64_i8__flags0:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b32_e32 v48, v0
-; GISEL-NEXT: v_mov_b32_e32 v49, v1
-; GISEL-NEXT: v_mov_b32_e32 v50, v2
-; GISEL-NEXT: v_mov_b32_e32 v51, v3
-; GISEL-NEXT: v_mov_b32_e32 v30, v4
-; GISEL-NEXT: v_mov_b32_e32 v31, v5
-; GISEL-NEXT: v_mov_b32_e32 v32, v6
-; GISEL-NEXT: v_mov_b32_e32 v33, v7
-; GISEL-NEXT: v_mov_b32_e32 v34, v8
-; GISEL-NEXT: v_mov_b32_e32 v35, v9
-; GISEL-NEXT: v_mov_b32_e32 v36, v10
-; GISEL-NEXT: v_mov_b32_e32 v37, v11
-; GISEL-NEXT: v_mov_b32_e32 v0, v12
-; GISEL-NEXT: v_mov_b32_e32 v1, v13
-; GISEL-NEXT: v_mov_b32_e32 v2, v14
-; GISEL-NEXT: v_mov_b32_e32 v3, v15
-; GISEL-NEXT: v_mov_b32_e32 v4, v16
-; GISEL-NEXT: v_mov_b32_e32 v5, v17
-; GISEL-NEXT: v_mov_b32_e32 v6, v18
-; GISEL-NEXT: v_mov_b32_e32 v7, v19
-; GISEL-NEXT: v_mov_b32_e32 v8, v20
-; GISEL-NEXT: v_mov_b32_e32 v9, v21
-; GISEL-NEXT: v_mov_b32_e32 v10, v22
-; GISEL-NEXT: v_mov_b32_e32 v11, v23
-; GISEL-NEXT: v_mov_b32_e32 v12, v24
-; GISEL-NEXT: v_mov_b32_e32 v13, v25
-; GISEL-NEXT: v_mov_b32_e32 v14, v26
-; GISEL-NEXT: v_mov_b32_e32 v15, v27
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
- ret <16 x i32> %result
-}
-
-define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_i32_32x32x64_i8__flags1:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_mov_b32_e32 v0, v12
-; SDAG-NEXT: v_mov_b32_e32 v1, v13
-; SDAG-NEXT: v_mov_b32_e32 v2, v14
-; SDAG-NEXT: v_mov_b32_e32 v3, v15
-; SDAG-NEXT: v_mov_b32_e32 v4, v16
-; SDAG-NEXT: v_mov_b32_e32 v5, v17
-; SDAG-NEXT: v_mov_b32_e32 v6, v18
-; SDAG-NEXT: v_mov_b32_e32 v7, v19
-; SDAG-NEXT: v_mov_b32_e32 v8, v20
-; SDAG-NEXT: v_mov_b32_e32 v9, v21
-; SDAG-NEXT: v_mov_b32_e32 v10, v22
-; SDAG-NEXT: v_mov_b32_e32 v11, v23
-; SDAG-NEXT: v_mov_b32_e32 v12, v24
-; SDAG-NEXT: v_mov_b32_e32 v13, v25
-; SDAG-NEXT: v_mov_b32_e32 v14, v26
-; SDAG-NEXT: v_mov_b32_e32 v15, v27
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_i32_32x32x64_i8__flags1:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b32_e32 v48, v0
-; GISEL-NEXT: v_mov_b32_e32 v49, v1
-; GISEL-NEXT: v_mov_b32_e32 v50, v2
-; GISEL-NEXT: v_mov_b32_e32 v51, v3
-; GISEL-NEXT: v_mov_b32_e32 v30, v4
-; GISEL-NEXT: v_mov_b32_e32 v31, v5
-; GISEL-NEXT: v_mov_b32_e32 v32, v6
-; GISEL-NEXT: v_mov_b32_e32 v33, v7
-; GISEL-NEXT: v_mov_b32_e32 v34, v8
-; GISEL-NEXT: v_mov_b32_e32 v35, v9
-; GISEL-NEXT: v_mov_b32_e32 v36, v10
-; GISEL-NEXT: v_mov_b32_e32 v37, v11
-; GISEL-NEXT: v_mov_b32_e32 v0, v12
-; GISEL-NEXT: v_mov_b32_e32 v1, v13
-; GISEL-NEXT: v_mov_b32_e32 v2, v14
-; GISEL-NEXT: v_mov_b32_e32 v3, v15
-; GISEL-NEXT: v_mov_b32_e32 v4, v16
-; GISEL-NEXT: v_mov_b32_e32 v5, v17
-; GISEL-NEXT: v_mov_b32_e32 v6, v18
-; GISEL-NEXT: v_mov_b32_e32 v7, v19
-; GISEL-NEXT: v_mov_b32_e32 v8, v20
-; GISEL-NEXT: v_mov_b32_e32 v9, v21
-; GISEL-NEXT: v_mov_b32_e32 v10, v22
-; GISEL-NEXT: v_mov_b32_e32 v11, v23
-; GISEL-NEXT: v_mov_b32_e32 v12, v24
-; GISEL-NEXT: v_mov_b32_e32 v13, v25
-; GISEL-NEXT: v_mov_b32_e32 v14, v26
-; GISEL-NEXT: v_mov_b32_e32 v15, v27
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1
-; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
- ret <16 x i32> %result
-}
-
-define <16 x i32> @test_smfmac_i32_32x32x64_i8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x i32> inreg %arg2, i32 inreg %arg3) {
-; SDAG-LABEL: test_smfmac_i32_32x32x64_i8__sgpr:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v36, s0
-; SDAG-NEXT: v_mov_b32_e32 v37, s1
-; SDAG-NEXT: v_mov_b32_e32 v38, s2
-; SDAG-NEXT: v_mov_b32_e32 v39, s3
-; SDAG-NEXT: v_mov_b32_e32 v13, s25
-; SDAG-NEXT: v_mov_b32_e32 v14, s26
-; SDAG-NEXT: v_mov_b32_e32 v15, s27
-; SDAG-NEXT: v_mov_b32_e32 v16, s28
-; SDAG-NEXT: v_mov_b32_e32 v17, s29
-; SDAG-NEXT: v_mov_b32_e32 v28, s16
-; SDAG-NEXT: v_mov_b32_e32 v29, s17
-; SDAG-NEXT: v_mov_b32_e32 v30, s18
-; SDAG-NEXT: v_mov_b32_e32 v31, s19
-; SDAG-NEXT: v_mov_b32_e32 v32, s20
-; SDAG-NEXT: v_mov_b32_e32 v33, s21
-; SDAG-NEXT: v_mov_b32_e32 v34, s22
-; SDAG-NEXT: v_mov_b32_e32 v35, s23
-; SDAG-NEXT: v_mov_b32_e32 v12, s24
-; SDAG-NEXT: v_mov_b32_e32 v18, v0
-; SDAG-NEXT: v_mov_b32_e32 v19, v1
-; SDAG-NEXT: v_mov_b32_e32 v20, v2
-; SDAG-NEXT: v_mov_b32_e32 v21, v3
-; SDAG-NEXT: v_mov_b32_e32 v22, v4
-; SDAG-NEXT: v_mov_b32_e32 v23, v5
-; SDAG-NEXT: v_mov_b32_e32 v24, v6
-; SDAG-NEXT: v_mov_b32_e32 v25, v7
-; SDAG-NEXT: v_mov_b32_e32 v26, v8
-; SDAG-NEXT: v_mov_b32_e32 v27, v9
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[36:39], v[28:35], v10
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_mov_b32_e32 v0, v12
-; SDAG-NEXT: v_mov_b32_e32 v1, v13
-; SDAG-NEXT: v_mov_b32_e32 v2, v14
-; SDAG-NEXT: v_mov_b32_e32 v3, v15
-; SDAG-NEXT: v_mov_b32_e32 v4, v16
-; SDAG-NEXT: v_mov_b32_e32 v5, v17
-; SDAG-NEXT: v_mov_b32_e32 v6, v18
-; SDAG-NEXT: v_mov_b32_e32 v7, v19
-; SDAG-NEXT: v_mov_b32_e32 v8, v20
-; SDAG-NEXT: v_mov_b32_e32 v9, v21
-; SDAG-NEXT: v_mov_b32_e32 v10, v22
-; SDAG-NEXT: v_mov_b32_e32 v11, v23
-; SDAG-NEXT: v_mov_b32_e32 v12, v24
-; SDAG-NEXT: v_mov_b32_e32 v13, v25
-; SDAG-NEXT: v_mov_b32_e32 v14, v26
-; SDAG-NEXT: v_mov_b32_e32 v15, v27
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_i32_32x32x64_i8__sgpr:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[2:3]
-; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[0:1]
-; GISEL-NEXT: v_mov_b32_e32 v11, v0
-; GISEL-NEXT: v_mov_b32_e32 v12, v1
-; GISEL-NEXT: v_mov_b32_e32 v13, v2
-; GISEL-NEXT: v_mov_b32_e32 v14, v3
-; GISEL-NEXT: v_mov_b32_e32 v15, v4
-; GISEL-NEXT: v_mov_b32_e32 v16, v5
-; GISEL-NEXT: v_mov_b32_e32 v17, v6
-; GISEL-NEXT: v_mov_b32_e32 v18, v7
-; GISEL-NEXT: v_mov_b32_e32 v19, v8
-; GISEL-NEXT: v_mov_b32_e32 v20, v9
-; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[22:23]
-; GISEL-NEXT: v_mov_b32_e32 v21, v10
-; GISEL-NEXT: v_mov_b32_e32 v0, s24
-; GISEL-NEXT: v_mov_b32_e32 v1, s25
-; GISEL-NEXT: v_mov_b32_e32 v2, s26
-; GISEL-NEXT: v_mov_b32_e32 v3, s27
-; GISEL-NEXT: v_mov_b32_e32 v4, s28
-; GISEL-NEXT: v_mov_b32_e32 v5, s29
-; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17]
-; GISEL-NEXT: v_mov_b32_e32 v6, v11
-; GISEL-NEXT: v_mov_b32_e32 v7, v12
-; GISEL-NEXT: v_mov_b32_e32 v8, v13
-; GISEL-NEXT: v_mov_b32_e32 v9, v14
-; GISEL-NEXT: v_mov_b32_e32 v10, v15
-; GISEL-NEXT: v_mov_b32_e32 v11, v16
-; GISEL-NEXT: v_mov_b32_e32 v12, v17
-; GISEL-NEXT: v_mov_b32_e32 v13, v18
-; GISEL-NEXT: v_mov_b32_e32 v14, v19
-; GISEL-NEXT: v_mov_b32_e32 v15, v20
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[30:33], v[22:29], v21
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <16 x i32> %result
@@ -1518,19 +1272,19 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__sgpr(<4 x i32> inreg %arg
; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__sgpr:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[2:3]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[0:1]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[22:23]
; GISEL-NEXT: v_mov_b32_e32 v0, s24
; GISEL-NEXT: v_mov_b32_e32 v1, s25
; GISEL-NEXT: v_mov_b32_e32 v2, s26
; GISEL-NEXT: v_mov_b32_e32 v3, s27
-; GISEL-NEXT: v_mov_b32_e32 v16, s28
+; GISEL-NEXT: v_mov_b32_e32 v4, s28
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[0:3], v[12:15], v[4:11], v16
+; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[0:3], v[14:17], v[6:13], v4
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <4 x float> %result
@@ -1687,19 +1441,19 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__sgpr(<4 x i32> inreg %arg
; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__sgpr:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[2:3]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[0:1]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[22:23]
; GISEL-NEXT: v_mov_b32_e32 v0, s24
; GISEL-NEXT: v_mov_b32_e32 v1, s25
; GISEL-NEXT: v_mov_b32_e32 v2, s26
; GISEL-NEXT: v_mov_b32_e32 v3, s27
-; GISEL-NEXT: v_mov_b32_e32 v16, s28
+; GISEL-NEXT: v_mov_b32_e32 v4, s28
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[0:3], v[12:15], v[4:11], v16
+; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[0:3], v[14:17], v[6:13], v4
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <4 x float> %result
@@ -1856,19 +1610,19 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__sgpr(<4 x i32> inreg %arg
; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__sgpr:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[2:3]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[0:1]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[22:23]
; GISEL-NEXT: v_mov_b32_e32 v0, s24
; GISEL-NEXT: v_mov_b32_e32 v1, s25
; GISEL-NEXT: v_mov_b32_e32 v2, s26
; GISEL-NEXT: v_mov_b32_e32 v3, s27
-; GISEL-NEXT: v_mov_b32_e32 v16, s28
+; GISEL-NEXT: v_mov_b32_e32 v4, s28
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[0:3], v[12:15], v[4:11], v16
+; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[0:3], v[14:17], v[6:13], v4
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <4 x float> %result
@@ -2025,19 +1779,19 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__sgpr(<4 x i32> inreg %arg
; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__sgpr:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[2:3]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[0:1]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[22:23]
; GISEL-NEXT: v_mov_b32_e32 v0, s24
; GISEL-NEXT: v_mov_b32_e32 v1, s25
; GISEL-NEXT: v_mov_b32_e32 v2, s26
; GISEL-NEXT: v_mov_b32_e32 v3, s27
-; GISEL-NEXT: v_mov_b32_e32 v16, s28
+; GISEL-NEXT: v_mov_b32_e32 v4, s28
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[0:3], v[12:15], v[4:11], v16
+; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[0:3], v[14:17], v[6:13], v4
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <4 x float> %result
@@ -2129,168 +1883,151 @@ bb:
}
define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_mov_b32_e32 v0, v12
-; SDAG-NEXT: v_mov_b32_e32 v1, v13
-; SDAG-NEXT: v_mov_b32_e32 v2, v14
-; SDAG-NEXT: v_mov_b32_e32 v3, v15
-; SDAG-NEXT: v_mov_b32_e32 v4, v16
-; SDAG-NEXT: v_mov_b32_e32 v5, v17
-; SDAG-NEXT: v_mov_b32_e32 v6, v18
-; SDAG-NEXT: v_mov_b32_e32 v7, v19
-; SDAG-NEXT: v_mov_b32_e32 v8, v20
-; SDAG-NEXT: v_mov_b32_e32 v9, v21
-; SDAG-NEXT: v_mov_b32_e32 v10, v22
-; SDAG-NEXT: v_mov_b32_e32 v11, v23
-; SDAG-NEXT: v_mov_b32_e32 v12, v24
-; SDAG-NEXT: v_mov_b32_e32 v13, v25
-; SDAG-NEXT: v_mov_b32_e32 v14, v26
-; SDAG-NEXT: v_mov_b32_e32 v15, v27
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b32_e32 v48, v0
-; GISEL-NEXT: v_mov_b32_e32 v49, v1
-; GISEL-NEXT: v_mov_b32_e32 v50, v2
-; GISEL-NEXT: v_mov_b32_e32 v51, v3
-; GISEL-NEXT: v_mov_b32_e32 v30, v4
-; GISEL-NEXT: v_mov_b32_e32 v31, v5
-; GISEL-NEXT: v_mov_b32_e32 v32, v6
-; GISEL-NEXT: v_mov_b32_e32 v33, v7
-; GISEL-NEXT: v_mov_b32_e32 v34, v8
-; GISEL-NEXT: v_mov_b32_e32 v35, v9
-; GISEL-NEXT: v_mov_b32_e32 v36, v10
-; GISEL-NEXT: v_mov_b32_e32 v37, v11
-; GISEL-NEXT: v_mov_b32_e32 v0, v12
-; GISEL-NEXT: v_mov_b32_e32 v1, v13
-; GISEL-NEXT: v_mov_b32_e32 v2, v14
-; GISEL-NEXT: v_mov_b32_e32 v3, v15
-; GISEL-NEXT: v_mov_b32_e32 v4, v16
-; GISEL-NEXT: v_mov_b32_e32 v5, v17
-; GISEL-NEXT: v_mov_b32_e32 v6, v18
-; GISEL-NEXT: v_mov_b32_e32 v7, v19
-; GISEL-NEXT: v_mov_b32_e32 v8, v20
-; GISEL-NEXT: v_mov_b32_e32 v9, v21
-; GISEL-NEXT: v_mov_b32_e32 v10, v22
-; GISEL-NEXT: v_mov_b32_e32 v11, v23
-; GISEL-NEXT: v_mov_b32_e32 v12, v24
-; GISEL-NEXT: v_mov_b32_e32 v13, v25
-; GISEL-NEXT: v_mov_b32_e32 v14, v26
-; GISEL-NEXT: v_mov_b32_e32 v15, v27
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[48:51], v[30:37], v28
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_smfmac_f32_32x32x64_bf8_bf8:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_mov_b32_e32 v0, v12
+; GCN-NEXT: v_mov_b32_e32 v1, v13
+; GCN-NEXT: v_mov_b32_e32 v2, v14
+; GCN-NEXT: v_mov_b32_e32 v3, v15
+; GCN-NEXT: v_mov_b32_e32 v4, v16
+; GCN-NEXT: v_mov_b32_e32 v5, v17
+; GCN-NEXT: v_mov_b32_e32 v6, v18
+; GCN-NEXT: v_mov_b32_e32 v7, v19
+; GCN-NEXT: v_mov_b32_e32 v8, v20
+; GCN-NEXT: v_mov_b32_e32 v9, v21
+; GCN-NEXT: v_mov_b32_e32 v10, v22
+; GCN-NEXT: v_mov_b32_e32 v11, v23
+; GCN-NEXT: v_mov_b32_e32 v12, v24
+; GCN-NEXT: v_mov_b32_e32 v13, v25
+; GCN-NEXT: v_mov_b32_e32 v14, v26
+; GCN-NEXT: v_mov_b32_e32 v15, v27
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <16 x float> %result
}
define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags0:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_mov_b32_e32 v0, v12
-; SDAG-NEXT: v_mov_b32_e32 v1, v13
-; SDAG-NEXT: v_mov_b32_e32 v2, v14
-; SDAG-NEXT: v_mov_b32_e32 v3, v15
-; SDAG-NEXT: v_mov_b32_e32 v4, v16
-; SDAG-NEXT: v_mov_b32_e32 v5, v17
-; SDAG-NEXT: v_mov_b32_e32 v6, v18
-; SDAG-NEXT: v_mov_b32_e32 v7, v19
-; SDAG-NEXT: v_mov_b32_e32 v8, v20
-; SDAG-NEXT: v_mov_b32_e32 v9, v21
-; SDAG-NEXT: v_mov_b32_e32 v10, v22
-; SDAG-NEXT: v_mov_b32_e32 v11, v23
-; SDAG-NEXT: v_mov_b32_e32 v12, v24
-; SDAG-NEXT: v_mov_b32_e32 v13, v25
-; SDAG-NEXT: v_mov_b32_e32 v14, v26
-; SDAG-NEXT: v_mov_b32_e32 v15, v27
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags0:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b32_e32 v48, v0
-; GISEL-NEXT: v_mov_b32_e32 v49, v1
-; GISEL-NEXT: v_mov_b32_e32 v50, v2
-; GISEL-NEXT: v_mov_b32_e32 v51, v3
-; GISEL-NEXT: v_mov_b32_e32 v30, v4
-; GISEL-NEXT: v_mov_b32_e32 v31, v5
-; GISEL-NEXT: v_mov_b32_e32 v32, v6
-; GISEL-NEXT: v_mov_b32_e32 v33, v7
-; GISEL-NEXT: v_mov_b32_e32 v34, v8
-; GISEL-NEXT: v_mov_b32_e32 v35, v9
-; GISEL-NEXT: v_mov_b32_e32 v36, v10
-; GISEL-NEXT: v_mov_b32_e32 v37, v11
-; GISEL-NEXT: v_mov_b32_e32 v0, v12
-; GISEL-NEXT: v_mov_b32_e32 v1, v13
-; GISEL-NEXT: v_mov_b32_e32 v2, v14
-; GISEL-NEXT: v_mov_b32_e32 v3, v15
-; GISEL-NEXT: v_mov_b32_e32 v4, v16
-; GISEL-NEXT: v_mov_b32_e32 v5, v17
-; GISEL-NEXT: v_mov_b32_e32 v6, v18
-; GISEL-NEXT: v_mov_b32_e32 v7, v19
-; GISEL-NEXT: v_mov_b32_e32 v8, v20
-; GISEL-NEXT: v_mov_b32_e32 v9, v21
-; GISEL-NEXT: v_mov_b32_e32 v10, v22
-; GISEL-NEXT: v_mov_b32_e32 v11, v23
-; GISEL-NEXT: v_mov_b32_e32 v12, v24
-; GISEL-NEXT: v_mov_b32_e32 v13, v25
-; GISEL-NEXT: v_mov_b32_e32 v14, v26
-; GISEL-NEXT: v_mov_b32_e32 v15, v27
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_mov_b32_e32 v0, v12
+; GCN-NEXT: v_mov_b32_e32 v1, v13
+; GCN-NEXT: v_mov_b32_e32 v2, v14
+; GCN-NEXT: v_mov_b32_e32 v3, v15
+; GCN-NEXT: v_mov_b32_e32 v4, v16
+; GCN-NEXT: v_mov_b32_e32 v5, v17
+; GCN-NEXT: v_mov_b32_e32 v6, v18
+; GCN-NEXT: v_mov_b32_e32 v7, v19
+; GCN-NEXT: v_mov_b32_e32 v8, v20
+; GCN-NEXT: v_mov_b32_e32 v9, v21
+; GCN-NEXT: v_mov_b32_e32 v10, v22
+; GCN-NEXT: v_mov_b32_e32 v11, v23
+; GCN-NEXT: v_mov_b32_e32 v12, v24
+; GCN-NEXT: v_mov_b32_e32 v13, v25
+; GCN-NEXT: v_mov_b32_e32 v14, v26
+; GCN-NEXT: v_mov_b32_e32 v15, v27
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
ret <16 x float> %result
}
define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags1:
+; GCN-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_mov_b32_e32 v0, v12
+; GCN-NEXT: v_mov_b32_e32 v1, v13
+; GCN-NEXT: v_mov_b32_e32 v2, v14
+; GCN-NEXT: v_mov_b32_e32 v3, v15
+; GCN-NEXT: v_mov_b32_e32 v4, v16
+; GCN-NEXT: v_mov_b32_e32 v5, v17
+; GCN-NEXT: v_mov_b32_e32 v6, v18
+; GCN-NEXT: v_mov_b32_e32 v7, v19
+; GCN-NEXT: v_mov_b32_e32 v8, v20
+; GCN-NEXT: v_mov_b32_e32 v9, v21
+; GCN-NEXT: v_mov_b32_e32 v10, v22
+; GCN-NEXT: v_mov_b32_e32 v11, v23
+; GCN-NEXT: v_mov_b32_e32 v12, v24
+; GCN-NEXT: v_mov_b32_e32 v13, v25
+; GCN-NEXT: v_mov_b32_e32 v14, v26
+; GCN-NEXT: v_mov_b32_e32 v15, v27
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
+ ret <16 x float> %result
+}
+
+define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x float> inreg %arg2, i32 inreg %arg3) {
+; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_mov_b32_e32 v0, v12
-; SDAG-NEXT: v_mov_b32_e32 v1, v13
-; SDAG-NEXT: v_mov_b32_e32 v2, v14
-; SDAG-NEXT: v_mov_b32_e32 v3, v15
-; SDAG-NEXT: v_mov_b32_e32 v4, v16
-; SDAG-NEXT: v_mov_b32_e32 v5, v17
-; SDAG-NEXT: v_mov_b32_e32 v6, v18
-; SDAG-NEXT: v_mov_b32_e32 v7, v19
-; SDAG-NEXT: v_mov_b32_e32 v8, v20
-; SDAG-NEXT: v_mov_b32_e32 v9, v21
-; SDAG-NEXT: v_mov_b32_e32 v10, v22
-; SDAG-NEXT: v_mov_b32_e32 v11, v23
-; SDAG-NEXT: v_mov_b32_e32 v12, v24
-; SDAG-NEXT: v_mov_b32_e32 v13, v25
-; SDAG-NEXT: v_mov_b32_e32 v14, v26
-; SDAG-NEXT: v_mov_b32_e32 v15, v27
+; SDAG-NEXT: v_mov_b32_e32 v26, s0
+; SDAG-NEXT: v_mov_b32_e32 v27, s1
+; SDAG-NEXT: v_mov_b32_e32 v28, s2
+; SDAG-NEXT: v_mov_b32_e32 v29, s3
+; SDAG-NEXT: v_mov_b32_e32 v16, v10
+; SDAG-NEXT: v_mov_b32_e32 v15, v9
+; SDAG-NEXT: v_mov_b32_e32 v14, v8
+; SDAG-NEXT: v_mov_b32_e32 v13, v7
+; SDAG-NEXT: v_mov_b32_e32 v12, v6
+; SDAG-NEXT: v_mov_b32_e32 v11, v5
+; SDAG-NEXT: v_mov_b32_e32 v10, v4
+; SDAG-NEXT: v_mov_b32_e32 v9, v3
+; SDAG-NEXT: v_mov_b32_e32 v8, v2
+; SDAG-NEXT: v_mov_b32_e32 v7, v1
+; SDAG-NEXT: v_mov_b32_e32 v6, v0
+; SDAG-NEXT: v_mov_b32_e32 v0, s24
+; SDAG-NEXT: v_mov_b32_e32 v1, s25
+; SDAG-NEXT: v_mov_b32_e32 v2, s26
+; SDAG-NEXT: v_mov_b32_e32 v3, s27
+; SDAG-NEXT: v_mov_b32_e32 v4, s28
+; SDAG-NEXT: v_mov_b32_e32 v5, s29
+; SDAG-NEXT: v_mov_b32_e32 v18, s16
+; SDAG-NEXT: v_mov_b32_e32 v19, s17
+; SDAG-NEXT: v_mov_b32_e32 v20, s18
+; SDAG-NEXT: v_mov_b32_e32 v21, s19
+; SDAG-NEXT: v_mov_b32_e32 v22, s20
+; SDAG-NEXT: v_mov_b32_e32 v23, s21
+; SDAG-NEXT: v_mov_b32_e32 v24, s22
+; SDAG-NEXT: v_mov_b32_e32 v25, s23
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[26:29], v[18:25], v16
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags1:
+; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__sgpr:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b32_e32 v48, v0
-; GISEL-NEXT: v_mov_b32_e32 v49, v1
-; GISEL-NEXT: v_mov_b32_e32 v50, v2
-; GISEL-NEXT: v_mov_b32_e32 v51, v3
-; GISEL-NEXT: v_mov_b32_e32 v30, v4
-; GISEL-NEXT: v_mov_b32_e32 v31, v5
-; GISEL-NEXT: v_mov_b32_e32 v32, v6
-; GISEL-NEXT: v_mov_b32_e32 v33, v7
-; GISEL-NEXT: v_mov_b32_e32 v34, v8
-; GISEL-NEXT: v_mov_b32_e32 v35, v9
-; GISEL-NEXT: v_mov_b32_e32 v36, v10
-; GISEL-NEXT: v_mov_b32_e32 v37, v11
+; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[2:3]
+; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[0:1]
+; GISEL-NEXT: v_mov_b32_e32 v18, v0
+; GISEL-NEXT: v_mov_b32_e32 v19, v1
+; GISEL-NEXT: v_mov_b32_e32 v20, v2
+; GISEL-NEXT: v_mov_b32_e32 v21, v3
+; GISEL-NEXT: v_mov_b32_e32 v22, v4
+; GISEL-NEXT: v_mov_b32_e32 v23, v5
+; GISEL-NEXT: v_mov_b32_e32 v24, v6
+; GISEL-NEXT: v_mov_b32_e32 v25, v7
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[16:17]
+; GISEL-NEXT: v_mov_b32_e32 v26, v8
+; GISEL-NEXT: v_mov_b32_e32 v27, v9
+; GISEL-NEXT: v_mov_b32_e32 v12, s24
+; GISEL-NEXT: v_mov_b32_e32 v13, s25
+; GISEL-NEXT: v_mov_b32_e32 v14, s26
+; GISEL-NEXT: v_mov_b32_e32 v15, s27
+; GISEL-NEXT: v_mov_b32_e32 v16, s28
+; GISEL-NEXT: v_mov_b32_e32 v17, s29
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[22:23]
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[28:31], v[0:7], v10
+; GISEL-NEXT: s_nop 11
; GISEL-NEXT: v_mov_b32_e32 v0, v12
; GISEL-NEXT: v_mov_b32_e32 v1, v13
; GISEL-NEXT: v_mov_b32_e32 v2, v14
@@ -2307,104 +2044,6 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags1(<4 x i32> %arg0, <
; GISEL-NEXT: v_mov_b32_e32 v13, v25
; GISEL-NEXT: v_mov_b32_e32 v14, v26
; GISEL-NEXT: v_mov_b32_e32 v15, v27
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1
-; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
- ret <16 x float> %result
-}
-
-define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x float> inreg %arg2, i32 inreg %arg3) {
-; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__sgpr:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v36, s0
-; SDAG-NEXT: v_mov_b32_e32 v37, s1
-; SDAG-NEXT: v_mov_b32_e32 v38, s2
-; SDAG-NEXT: v_mov_b32_e32 v39, s3
-; SDAG-NEXT: v_mov_b32_e32 v13, s25
-; SDAG-NEXT: v_mov_b32_e32 v14, s26
-; SDAG-NEXT: v_mov_b32_e32 v15, s27
-; SDAG-NEXT: v_mov_b32_e32 v16, s28
-; SDAG-NEXT: v_mov_b32_e32 v17, s29
-; SDAG-NEXT: v_mov_b32_e32 v28, s16
-; SDAG-NEXT: v_mov_b32_e32 v29, s17
-; SDAG-NEXT: v_mov_b32_e32 v30, s18
-; SDAG-NEXT: v_mov_b32_e32 v31, s19
-; SDAG-NEXT: v_mov_b32_e32 v32, s20
-; SDAG-NEXT: v_mov_b32_e32 v33, s21
-; SDAG-NEXT: v_mov_b32_e32 v34, s22
-; SDAG-NEXT: v_mov_b32_e32 v35, s23
-; SDAG-NEXT: v_mov_b32_e32 v12, s24
-; SDAG-NEXT: v_mov_b32_e32 v18, v0
-; SDAG-NEXT: v_mov_b32_e32 v19, v1
-; SDAG-NEXT: v_mov_b32_e32 v20, v2
-; SDAG-NEXT: v_mov_b32_e32 v21, v3
-; SDAG-NEXT: v_mov_b32_e32 v22, v4
-; SDAG-NEXT: v_mov_b32_e32 v23, v5
-; SDAG-NEXT: v_mov_b32_e32 v24, v6
-; SDAG-NEXT: v_mov_b32_e32 v25, v7
-; SDAG-NEXT: v_mov_b32_e32 v26, v8
-; SDAG-NEXT: v_mov_b32_e32 v27, v9
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[36:39], v[28:35], v10
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_mov_b32_e32 v0, v12
-; SDAG-NEXT: v_mov_b32_e32 v1, v13
-; SDAG-NEXT: v_mov_b32_e32 v2, v14
-; SDAG-NEXT: v_mov_b32_e32 v3, v15
-; SDAG-NEXT: v_mov_b32_e32 v4, v16
-; SDAG-NEXT: v_mov_b32_e32 v5, v17
-; SDAG-NEXT: v_mov_b32_e32 v6, v18
-; SDAG-NEXT: v_mov_b32_e32 v7, v19
-; SDAG-NEXT: v_mov_b32_e32 v8, v20
-; SDAG-NEXT: v_mov_b32_e32 v9, v21
-; SDAG-NEXT: v_mov_b32_e32 v10, v22
-; SDAG-NEXT: v_mov_b32_e32 v11, v23
-; SDAG-NEXT: v_mov_b32_e32 v12, v24
-; SDAG-NEXT: v_mov_b32_e32 v13, v25
-; SDAG-NEXT: v_mov_b32_e32 v14, v26
-; SDAG-NEXT: v_mov_b32_e32 v15, v27
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__sgpr:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[2:3]
-; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[0:1]
-; GISEL-NEXT: v_mov_b32_e32 v11, v0
-; GISEL-NEXT: v_mov_b32_e32 v12, v1
-; GISEL-NEXT: v_mov_b32_e32 v13, v2
-; GISEL-NEXT: v_mov_b32_e32 v14, v3
-; GISEL-NEXT: v_mov_b32_e32 v15, v4
-; GISEL-NEXT: v_mov_b32_e32 v16, v5
-; GISEL-NEXT: v_mov_b32_e32 v17, v6
-; GISEL-NEXT: v_mov_b32_e32 v18, v7
-; GISEL-NEXT: v_mov_b32_e32 v19, v8
-; GISEL-NEXT: v_mov_b32_e32 v20, v9
-; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[22:23]
-; GISEL-NEXT: v_mov_b32_e32 v21, v10
-; GISEL-NEXT: v_mov_b32_e32 v0, s24
-; GISEL-NEXT: v_mov_b32_e32 v1, s25
-; GISEL-NEXT: v_mov_b32_e32 v2, s26
-; GISEL-NEXT: v_mov_b32_e32 v3, s27
-; GISEL-NEXT: v_mov_b32_e32 v4, s28
-; GISEL-NEXT: v_mov_b32_e32 v5, s29
-; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17]
-; GISEL-NEXT: v_mov_b32_e32 v6, v11
-; GISEL-NEXT: v_mov_b32_e32 v7, v12
-; GISEL-NEXT: v_mov_b32_e32 v8, v13
-; GISEL-NEXT: v_mov_b32_e32 v9, v14
-; GISEL-NEXT: v_mov_b32_e32 v10, v15
-; GISEL-NEXT: v_mov_b32_e32 v11, v16
-; GISEL-NEXT: v_mov_b32_e32 v12, v17
-; GISEL-NEXT: v_mov_b32_e32 v13, v18
-; GISEL-NEXT: v_mov_b32_e32 v14, v19
-; GISEL-NEXT: v_mov_b32_e32 v15, v20
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[30:33], v[22:29], v21
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <16 x float> %result
@@ -2496,168 +2135,151 @@ bb:
}
define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_mov_b32_e32 v0, v12
-; SDAG-NEXT: v_mov_b32_e32 v1, v13
-; SDAG-NEXT: v_mov_b32_e32 v2, v14
-; SDAG-NEXT: v_mov_b32_e32 v3, v15
-; SDAG-NEXT: v_mov_b32_e32 v4, v16
-; SDAG-NEXT: v_mov_b32_e32 v5, v17
-; SDAG-NEXT: v_mov_b32_e32 v6, v18
-; SDAG-NEXT: v_mov_b32_e32 v7, v19
-; SDAG-NEXT: v_mov_b32_e32 v8, v20
-; SDAG-NEXT: v_mov_b32_e32 v9, v21
-; SDAG-NEXT: v_mov_b32_e32 v10, v22
-; SDAG-NEXT: v_mov_b32_e32 v11, v23
-; SDAG-NEXT: v_mov_b32_e32 v12, v24
-; SDAG-NEXT: v_mov_b32_e32 v13, v25
-; SDAG-NEXT: v_mov_b32_e32 v14, v26
-; SDAG-NEXT: v_mov_b32_e32 v15, v27
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b32_e32 v48, v0
-; GISEL-NEXT: v_mov_b32_e32 v49, v1
-; GISEL-NEXT: v_mov_b32_e32 v50, v2
-; GISEL-NEXT: v_mov_b32_e32 v51, v3
-; GISEL-NEXT: v_mov_b32_e32 v30, v4
-; GISEL-NEXT: v_mov_b32_e32 v31, v5
-; GISEL-NEXT: v_mov_b32_e32 v32, v6
-; GISEL-NEXT: v_mov_b32_e32 v33, v7
-; GISEL-NEXT: v_mov_b32_e32 v34, v8
-; GISEL-NEXT: v_mov_b32_e32 v35, v9
-; GISEL-NEXT: v_mov_b32_e32 v36, v10
-; GISEL-NEXT: v_mov_b32_e32 v37, v11
-; GISEL-NEXT: v_mov_b32_e32 v0, v12
-; GISEL-NEXT: v_mov_b32_e32 v1, v13
-; GISEL-NEXT: v_mov_b32_e32 v2, v14
-; GISEL-NEXT: v_mov_b32_e32 v3, v15
-; GISEL-NEXT: v_mov_b32_e32 v4, v16
-; GISEL-NEXT: v_mov_b32_e32 v5, v17
-; GISEL-NEXT: v_mov_b32_e32 v6, v18
-; GISEL-NEXT: v_mov_b32_e32 v7, v19
-; GISEL-NEXT: v_mov_b32_e32 v8, v20
-; GISEL-NEXT: v_mov_b32_e32 v9, v21
-; GISEL-NEXT: v_mov_b32_e32 v10, v22
-; GISEL-NEXT: v_mov_b32_e32 v11, v23
-; GISEL-NEXT: v_mov_b32_e32 v12, v24
-; GISEL-NEXT: v_mov_b32_e32 v13, v25
-; GISEL-NEXT: v_mov_b32_e32 v14, v26
-; GISEL-NEXT: v_mov_b32_e32 v15, v27
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[48:51], v[30:37], v28
-; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
- ret <16 x float> %result
-}
+; GCN-LABEL: test_smfmac_f32_32x32x64_bf8_fp8:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_mov_b32_e32 v0, v12
+; GCN-NEXT: v_mov_b32_e32 v1, v13
+; GCN-NEXT: v_mov_b32_e32 v2, v14
+; GCN-NEXT: v_mov_b32_e32 v3, v15
+; GCN-NEXT: v_mov_b32_e32 v4, v16
+; GCN-NEXT: v_mov_b32_e32 v5, v17
+; GCN-NEXT: v_mov_b32_e32 v6, v18
+; GCN-NEXT: v_mov_b32_e32 v7, v19
+; GCN-NEXT: v_mov_b32_e32 v8, v20
+; GCN-NEXT: v_mov_b32_e32 v9, v21
+; GCN-NEXT: v_mov_b32_e32 v10, v22
+; GCN-NEXT: v_mov_b32_e32 v11, v23
+; GCN-NEXT: v_mov_b32_e32 v12, v24
+; GCN-NEXT: v_mov_b32_e32 v13, v25
+; GCN-NEXT: v_mov_b32_e32 v14, v26
+; GCN-NEXT: v_mov_b32_e32 v15, v27
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
+ ret <16 x float> %result
+}
define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags0:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_mov_b32_e32 v0, v12
-; SDAG-NEXT: v_mov_b32_e32 v1, v13
-; SDAG-NEXT: v_mov_b32_e32 v2, v14
-; SDAG-NEXT: v_mov_b32_e32 v3, v15
-; SDAG-NEXT: v_mov_b32_e32 v4, v16
-; SDAG-NEXT: v_mov_b32_e32 v5, v17
-; SDAG-NEXT: v_mov_b32_e32 v6, v18
-; SDAG-NEXT: v_mov_b32_e32 v7, v19
-; SDAG-NEXT: v_mov_b32_e32 v8, v20
-; SDAG-NEXT: v_mov_b32_e32 v9, v21
-; SDAG-NEXT: v_mov_b32_e32 v10, v22
-; SDAG-NEXT: v_mov_b32_e32 v11, v23
-; SDAG-NEXT: v_mov_b32_e32 v12, v24
-; SDAG-NEXT: v_mov_b32_e32 v13, v25
-; SDAG-NEXT: v_mov_b32_e32 v14, v26
-; SDAG-NEXT: v_mov_b32_e32 v15, v27
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags0:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b32_e32 v48, v0
-; GISEL-NEXT: v_mov_b32_e32 v49, v1
-; GISEL-NEXT: v_mov_b32_e32 v50, v2
-; GISEL-NEXT: v_mov_b32_e32 v51, v3
-; GISEL-NEXT: v_mov_b32_e32 v30, v4
-; GISEL-NEXT: v_mov_b32_e32 v31, v5
-; GISEL-NEXT: v_mov_b32_e32 v32, v6
-; GISEL-NEXT: v_mov_b32_e32 v33, v7
-; GISEL-NEXT: v_mov_b32_e32 v34, v8
-; GISEL-NEXT: v_mov_b32_e32 v35, v9
-; GISEL-NEXT: v_mov_b32_e32 v36, v10
-; GISEL-NEXT: v_mov_b32_e32 v37, v11
-; GISEL-NEXT: v_mov_b32_e32 v0, v12
-; GISEL-NEXT: v_mov_b32_e32 v1, v13
-; GISEL-NEXT: v_mov_b32_e32 v2, v14
-; GISEL-NEXT: v_mov_b32_e32 v3, v15
-; GISEL-NEXT: v_mov_b32_e32 v4, v16
-; GISEL-NEXT: v_mov_b32_e32 v5, v17
-; GISEL-NEXT: v_mov_b32_e32 v6, v18
-; GISEL-NEXT: v_mov_b32_e32 v7, v19
-; GISEL-NEXT: v_mov_b32_e32 v8, v20
-; GISEL-NEXT: v_mov_b32_e32 v9, v21
-; GISEL-NEXT: v_mov_b32_e32 v10, v22
-; GISEL-NEXT: v_mov_b32_e32 v11, v23
-; GISEL-NEXT: v_mov_b32_e32 v12, v24
-; GISEL-NEXT: v_mov_b32_e32 v13, v25
-; GISEL-NEXT: v_mov_b32_e32 v14, v26
-; GISEL-NEXT: v_mov_b32_e32 v15, v27
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_mov_b32_e32 v0, v12
+; GCN-NEXT: v_mov_b32_e32 v1, v13
+; GCN-NEXT: v_mov_b32_e32 v2, v14
+; GCN-NEXT: v_mov_b32_e32 v3, v15
+; GCN-NEXT: v_mov_b32_e32 v4, v16
+; GCN-NEXT: v_mov_b32_e32 v5, v17
+; GCN-NEXT: v_mov_b32_e32 v6, v18
+; GCN-NEXT: v_mov_b32_e32 v7, v19
+; GCN-NEXT: v_mov_b32_e32 v8, v20
+; GCN-NEXT: v_mov_b32_e32 v9, v21
+; GCN-NEXT: v_mov_b32_e32 v10, v22
+; GCN-NEXT: v_mov_b32_e32 v11, v23
+; GCN-NEXT: v_mov_b32_e32 v12, v24
+; GCN-NEXT: v_mov_b32_e32 v13, v25
+; GCN-NEXT: v_mov_b32_e32 v14, v26
+; GCN-NEXT: v_mov_b32_e32 v15, v27
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
ret <16 x float> %result
}
define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags1:
+; GCN-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_mov_b32_e32 v0, v12
+; GCN-NEXT: v_mov_b32_e32 v1, v13
+; GCN-NEXT: v_mov_b32_e32 v2, v14
+; GCN-NEXT: v_mov_b32_e32 v3, v15
+; GCN-NEXT: v_mov_b32_e32 v4, v16
+; GCN-NEXT: v_mov_b32_e32 v5, v17
+; GCN-NEXT: v_mov_b32_e32 v6, v18
+; GCN-NEXT: v_mov_b32_e32 v7, v19
+; GCN-NEXT: v_mov_b32_e32 v8, v20
+; GCN-NEXT: v_mov_b32_e32 v9, v21
+; GCN-NEXT: v_mov_b32_e32 v10, v22
+; GCN-NEXT: v_mov_b32_e32 v11, v23
+; GCN-NEXT: v_mov_b32_e32 v12, v24
+; GCN-NEXT: v_mov_b32_e32 v13, v25
+; GCN-NEXT: v_mov_b32_e32 v14, v26
+; GCN-NEXT: v_mov_b32_e32 v15, v27
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
+ ret <16 x float> %result
+}
+
+define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x float> inreg %arg2, i32 inreg %arg3) {
+; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_mov_b32_e32 v0, v12
-; SDAG-NEXT: v_mov_b32_e32 v1, v13
-; SDAG-NEXT: v_mov_b32_e32 v2, v14
-; SDAG-NEXT: v_mov_b32_e32 v3, v15
-; SDAG-NEXT: v_mov_b32_e32 v4, v16
-; SDAG-NEXT: v_mov_b32_e32 v5, v17
-; SDAG-NEXT: v_mov_b32_e32 v6, v18
-; SDAG-NEXT: v_mov_b32_e32 v7, v19
-; SDAG-NEXT: v_mov_b32_e32 v8, v20
-; SDAG-NEXT: v_mov_b32_e32 v9, v21
-; SDAG-NEXT: v_mov_b32_e32 v10, v22
-; SDAG-NEXT: v_mov_b32_e32 v11, v23
-; SDAG-NEXT: v_mov_b32_e32 v12, v24
-; SDAG-NEXT: v_mov_b32_e32 v13, v25
-; SDAG-NEXT: v_mov_b32_e32 v14, v26
-; SDAG-NEXT: v_mov_b32_e32 v15, v27
+; SDAG-NEXT: v_mov_b32_e32 v26, s0
+; SDAG-NEXT: v_mov_b32_e32 v27, s1
+; SDAG-NEXT: v_mov_b32_e32 v28, s2
+; SDAG-NEXT: v_mov_b32_e32 v29, s3
+; SDAG-NEXT: v_mov_b32_e32 v16, v10
+; SDAG-NEXT: v_mov_b32_e32 v15, v9
+; SDAG-NEXT: v_mov_b32_e32 v14, v8
+; SDAG-NEXT: v_mov_b32_e32 v13, v7
+; SDAG-NEXT: v_mov_b32_e32 v12, v6
+; SDAG-NEXT: v_mov_b32_e32 v11, v5
+; SDAG-NEXT: v_mov_b32_e32 v10, v4
+; SDAG-NEXT: v_mov_b32_e32 v9, v3
+; SDAG-NEXT: v_mov_b32_e32 v8, v2
+; SDAG-NEXT: v_mov_b32_e32 v7, v1
+; SDAG-NEXT: v_mov_b32_e32 v6, v0
+; SDAG-NEXT: v_mov_b32_e32 v0, s24
+; SDAG-NEXT: v_mov_b32_e32 v1, s25
+; SDAG-NEXT: v_mov_b32_e32 v2, s26
+; SDAG-NEXT: v_mov_b32_e32 v3, s27
+; SDAG-NEXT: v_mov_b32_e32 v4, s28
+; SDAG-NEXT: v_mov_b32_e32 v5, s29
+; SDAG-NEXT: v_mov_b32_e32 v18, s16
+; SDAG-NEXT: v_mov_b32_e32 v19, s17
+; SDAG-NEXT: v_mov_b32_e32 v20, s18
+; SDAG-NEXT: v_mov_b32_e32 v21, s19
+; SDAG-NEXT: v_mov_b32_e32 v22, s20
+; SDAG-NEXT: v_mov_b32_e32 v23, s21
+; SDAG-NEXT: v_mov_b32_e32 v24, s22
+; SDAG-NEXT: v_mov_b32_e32 v25, s23
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[26:29], v[18:25], v16
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags1:
+; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__sgpr:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b32_e32 v48, v0
-; GISEL-NEXT: v_mov_b32_e32 v49, v1
-; GISEL-NEXT: v_mov_b32_e32 v50, v2
-; GISEL-NEXT: v_mov_b32_e32 v51, v3
-; GISEL-NEXT: v_mov_b32_e32 v30, v4
-; GISEL-NEXT: v_mov_b32_e32 v31, v5
-; GISEL-NEXT: v_mov_b32_e32 v32, v6
-; GISEL-NEXT: v_mov_b32_e32 v33, v7
-; GISEL-NEXT: v_mov_b32_e32 v34, v8
-; GISEL-NEXT: v_mov_b32_e32 v35, v9
-; GISEL-NEXT: v_mov_b32_e32 v36, v10
-; GISEL-NEXT: v_mov_b32_e32 v37, v11
+; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[2:3]
+; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[0:1]
+; GISEL-NEXT: v_mov_b32_e32 v18, v0
+; GISEL-NEXT: v_mov_b32_e32 v19, v1
+; GISEL-NEXT: v_mov_b32_e32 v20, v2
+; GISEL-NEXT: v_mov_b32_e32 v21, v3
+; GISEL-NEXT: v_mov_b32_e32 v22, v4
+; GISEL-NEXT: v_mov_b32_e32 v23, v5
+; GISEL-NEXT: v_mov_b32_e32 v24, v6
+; GISEL-NEXT: v_mov_b32_e32 v25, v7
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[16:17]
+; GISEL-NEXT: v_mov_b32_e32 v26, v8
+; GISEL-NEXT: v_mov_b32_e32 v27, v9
+; GISEL-NEXT: v_mov_b32_e32 v12, s24
+; GISEL-NEXT: v_mov_b32_e32 v13, s25
+; GISEL-NEXT: v_mov_b32_e32 v14, s26
+; GISEL-NEXT: v_mov_b32_e32 v15, s27
+; GISEL-NEXT: v_mov_b32_e32 v16, s28
+; GISEL-NEXT: v_mov_b32_e32 v17, s29
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[22:23]
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[28:31], v[0:7], v10
+; GISEL-NEXT: s_nop 11
; GISEL-NEXT: v_mov_b32_e32 v0, v12
; GISEL-NEXT: v_mov_b32_e32 v1, v13
; GISEL-NEXT: v_mov_b32_e32 v2, v14
@@ -2674,104 +2296,6 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags1(<4 x i32> %arg0, <
; GISEL-NEXT: v_mov_b32_e32 v13, v25
; GISEL-NEXT: v_mov_b32_e32 v14, v26
; GISEL-NEXT: v_mov_b32_e32 v15, v27
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1
-; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
- ret <16 x float> %result
-}
-
-define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x float> inreg %arg2, i32 inreg %arg3) {
-; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__sgpr:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v36, s0
-; SDAG-NEXT: v_mov_b32_e32 v37, s1
-; SDAG-NEXT: v_mov_b32_e32 v38, s2
-; SDAG-NEXT: v_mov_b32_e32 v39, s3
-; SDAG-NEXT: v_mov_b32_e32 v13, s25
-; SDAG-NEXT: v_mov_b32_e32 v14, s26
-; SDAG-NEXT: v_mov_b32_e32 v15, s27
-; SDAG-NEXT: v_mov_b32_e32 v16, s28
-; SDAG-NEXT: v_mov_b32_e32 v17, s29
-; SDAG-NEXT: v_mov_b32_e32 v28, s16
-; SDAG-NEXT: v_mov_b32_e32 v29, s17
-; SDAG-NEXT: v_mov_b32_e32 v30, s18
-; SDAG-NEXT: v_mov_b32_e32 v31, s19
-; SDAG-NEXT: v_mov_b32_e32 v32, s20
-; SDAG-NEXT: v_mov_b32_e32 v33, s21
-; SDAG-NEXT: v_mov_b32_e32 v34, s22
-; SDAG-NEXT: v_mov_b32_e32 v35, s23
-; SDAG-NEXT: v_mov_b32_e32 v12, s24
-; SDAG-NEXT: v_mov_b32_e32 v18, v0
-; SDAG-NEXT: v_mov_b32_e32 v19, v1
-; SDAG-NEXT: v_mov_b32_e32 v20, v2
-; SDAG-NEXT: v_mov_b32_e32 v21, v3
-; SDAG-NEXT: v_mov_b32_e32 v22, v4
-; SDAG-NEXT: v_mov_b32_e32 v23, v5
-; SDAG-NEXT: v_mov_b32_e32 v24, v6
-; SDAG-NEXT: v_mov_b32_e32 v25, v7
-; SDAG-NEXT: v_mov_b32_e32 v26, v8
-; SDAG-NEXT: v_mov_b32_e32 v27, v9
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[36:39], v[28:35], v10
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_mov_b32_e32 v0, v12
-; SDAG-NEXT: v_mov_b32_e32 v1, v13
-; SDAG-NEXT: v_mov_b32_e32 v2, v14
-; SDAG-NEXT: v_mov_b32_e32 v3, v15
-; SDAG-NEXT: v_mov_b32_e32 v4, v16
-; SDAG-NEXT: v_mov_b32_e32 v5, v17
-; SDAG-NEXT: v_mov_b32_e32 v6, v18
-; SDAG-NEXT: v_mov_b32_e32 v7, v19
-; SDAG-NEXT: v_mov_b32_e32 v8, v20
-; SDAG-NEXT: v_mov_b32_e32 v9, v21
-; SDAG-NEXT: v_mov_b32_e32 v10, v22
-; SDAG-NEXT: v_mov_b32_e32 v11, v23
-; SDAG-NEXT: v_mov_b32_e32 v12, v24
-; SDAG-NEXT: v_mov_b32_e32 v13, v25
-; SDAG-NEXT: v_mov_b32_e32 v14, v26
-; SDAG-NEXT: v_mov_b32_e32 v15, v27
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__sgpr:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[2:3]
-; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[0:1]
-; GISEL-NEXT: v_mov_b32_e32 v11, v0
-; GISEL-NEXT: v_mov_b32_e32 v12, v1
-; GISEL-NEXT: v_mov_b32_e32 v13, v2
-; GISEL-NEXT: v_mov_b32_e32 v14, v3
-; GISEL-NEXT: v_mov_b32_e32 v15, v4
-; GISEL-NEXT: v_mov_b32_e32 v16, v5
-; GISEL-NEXT: v_mov_b32_e32 v17, v6
-; GISEL-NEXT: v_mov_b32_e32 v18, v7
-; GISEL-NEXT: v_mov_b32_e32 v19, v8
-; GISEL-NEXT: v_mov_b32_e32 v20, v9
-; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[22:23]
-; GISEL-NEXT: v_mov_b32_e32 v21, v10
-; GISEL-NEXT: v_mov_b32_e32 v0, s24
-; GISEL-NEXT: v_mov_b32_e32 v1, s25
-; GISEL-NEXT: v_mov_b32_e32 v2, s26
-; GISEL-NEXT: v_mov_b32_e32 v3, s27
-; GISEL-NEXT: v_mov_b32_e32 v4, s28
-; GISEL-NEXT: v_mov_b32_e32 v5, s29
-; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17]
-; GISEL-NEXT: v_mov_b32_e32 v6, v11
-; GISEL-NEXT: v_mov_b32_e32 v7, v12
-; GISEL-NEXT: v_mov_b32_e32 v8, v13
-; GISEL-NEXT: v_mov_b32_e32 v9, v14
-; GISEL-NEXT: v_mov_b32_e32 v10, v15
-; GISEL-NEXT: v_mov_b32_e32 v11, v16
-; GISEL-NEXT: v_mov_b32_e32 v12, v17
-; GISEL-NEXT: v_mov_b32_e32 v13, v18
-; GISEL-NEXT: v_mov_b32_e32 v14, v19
-; GISEL-NEXT: v_mov_b32_e32 v15, v20
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[30:33], v[22:29], v21
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <16 x float> %result
@@ -2863,168 +2387,151 @@ bb:
}
define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_mov_b32_e32 v0, v12
-; SDAG-NEXT: v_mov_b32_e32 v1, v13
-; SDAG-NEXT: v_mov_b32_e32 v2, v14
-; SDAG-NEXT: v_mov_b32_e32 v3, v15
-; SDAG-NEXT: v_mov_b32_e32 v4, v16
-; SDAG-NEXT: v_mov_b32_e32 v5, v17
-; SDAG-NEXT: v_mov_b32_e32 v6, v18
-; SDAG-NEXT: v_mov_b32_e32 v7, v19
-; SDAG-NEXT: v_mov_b32_e32 v8, v20
-; SDAG-NEXT: v_mov_b32_e32 v9, v21
-; SDAG-NEXT: v_mov_b32_e32 v10, v22
-; SDAG-NEXT: v_mov_b32_e32 v11, v23
-; SDAG-NEXT: v_mov_b32_e32 v12, v24
-; SDAG-NEXT: v_mov_b32_e32 v13, v25
-; SDAG-NEXT: v_mov_b32_e32 v14, v26
-; SDAG-NEXT: v_mov_b32_e32 v15, v27
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b32_e32 v48, v0
-; GISEL-NEXT: v_mov_b32_e32 v49, v1
-; GISEL-NEXT: v_mov_b32_e32 v50, v2
-; GISEL-NEXT: v_mov_b32_e32 v51, v3
-; GISEL-NEXT: v_mov_b32_e32 v30, v4
-; GISEL-NEXT: v_mov_b32_e32 v31, v5
-; GISEL-NEXT: v_mov_b32_e32 v32, v6
-; GISEL-NEXT: v_mov_b32_e32 v33, v7
-; GISEL-NEXT: v_mov_b32_e32 v34, v8
-; GISEL-NEXT: v_mov_b32_e32 v35, v9
-; GISEL-NEXT: v_mov_b32_e32 v36, v10
-; GISEL-NEXT: v_mov_b32_e32 v37, v11
-; GISEL-NEXT: v_mov_b32_e32 v0, v12
-; GISEL-NEXT: v_mov_b32_e32 v1, v13
-; GISEL-NEXT: v_mov_b32_e32 v2, v14
-; GISEL-NEXT: v_mov_b32_e32 v3, v15
-; GISEL-NEXT: v_mov_b32_e32 v4, v16
-; GISEL-NEXT: v_mov_b32_e32 v5, v17
-; GISEL-NEXT: v_mov_b32_e32 v6, v18
-; GISEL-NEXT: v_mov_b32_e32 v7, v19
-; GISEL-NEXT: v_mov_b32_e32 v8, v20
-; GISEL-NEXT: v_mov_b32_e32 v9, v21
-; GISEL-NEXT: v_mov_b32_e32 v10, v22
-; GISEL-NEXT: v_mov_b32_e32 v11, v23
-; GISEL-NEXT: v_mov_b32_e32 v12, v24
-; GISEL-NEXT: v_mov_b32_e32 v13, v25
-; GISEL-NEXT: v_mov_b32_e32 v14, v26
-; GISEL-NEXT: v_mov_b32_e32 v15, v27
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[48:51], v[30:37], v28
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_smfmac_f32_32x32x64_fp8_bf8:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_mov_b32_e32 v0, v12
+; GCN-NEXT: v_mov_b32_e32 v1, v13
+; GCN-NEXT: v_mov_b32_e32 v2, v14
+; GCN-NEXT: v_mov_b32_e32 v3, v15
+; GCN-NEXT: v_mov_b32_e32 v4, v16
+; GCN-NEXT: v_mov_b32_e32 v5, v17
+; GCN-NEXT: v_mov_b32_e32 v6, v18
+; GCN-NEXT: v_mov_b32_e32 v7, v19
+; GCN-NEXT: v_mov_b32_e32 v8, v20
+; GCN-NEXT: v_mov_b32_e32 v9, v21
+; GCN-NEXT: v_mov_b32_e32 v10, v22
+; GCN-NEXT: v_mov_b32_e32 v11, v23
+; GCN-NEXT: v_mov_b32_e32 v12, v24
+; GCN-NEXT: v_mov_b32_e32 v13, v25
+; GCN-NEXT: v_mov_b32_e32 v14, v26
+; GCN-NEXT: v_mov_b32_e32 v15, v27
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <16 x float> %result
}
define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags0:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_mov_b32_e32 v0, v12
-; SDAG-NEXT: v_mov_b32_e32 v1, v13
-; SDAG-NEXT: v_mov_b32_e32 v2, v14
-; SDAG-NEXT: v_mov_b32_e32 v3, v15
-; SDAG-NEXT: v_mov_b32_e32 v4, v16
-; SDAG-NEXT: v_mov_b32_e32 v5, v17
-; SDAG-NEXT: v_mov_b32_e32 v6, v18
-; SDAG-NEXT: v_mov_b32_e32 v7, v19
-; SDAG-NEXT: v_mov_b32_e32 v8, v20
-; SDAG-NEXT: v_mov_b32_e32 v9, v21
-; SDAG-NEXT: v_mov_b32_e32 v10, v22
-; SDAG-NEXT: v_mov_b32_e32 v11, v23
-; SDAG-NEXT: v_mov_b32_e32 v12, v24
-; SDAG-NEXT: v_mov_b32_e32 v13, v25
-; SDAG-NEXT: v_mov_b32_e32 v14, v26
-; SDAG-NEXT: v_mov_b32_e32 v15, v27
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags0:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b32_e32 v48, v0
-; GISEL-NEXT: v_mov_b32_e32 v49, v1
-; GISEL-NEXT: v_mov_b32_e32 v50, v2
-; GISEL-NEXT: v_mov_b32_e32 v51, v3
-; GISEL-NEXT: v_mov_b32_e32 v30, v4
-; GISEL-NEXT: v_mov_b32_e32 v31, v5
-; GISEL-NEXT: v_mov_b32_e32 v32, v6
-; GISEL-NEXT: v_mov_b32_e32 v33, v7
-; GISEL-NEXT: v_mov_b32_e32 v34, v8
-; GISEL-NEXT: v_mov_b32_e32 v35, v9
-; GISEL-NEXT: v_mov_b32_e32 v36, v10
-; GISEL-NEXT: v_mov_b32_e32 v37, v11
-; GISEL-NEXT: v_mov_b32_e32 v0, v12
-; GISEL-NEXT: v_mov_b32_e32 v1, v13
-; GISEL-NEXT: v_mov_b32_e32 v2, v14
-; GISEL-NEXT: v_mov_b32_e32 v3, v15
-; GISEL-NEXT: v_mov_b32_e32 v4, v16
-; GISEL-NEXT: v_mov_b32_e32 v5, v17
-; GISEL-NEXT: v_mov_b32_e32 v6, v18
-; GISEL-NEXT: v_mov_b32_e32 v7, v19
-; GISEL-NEXT: v_mov_b32_e32 v8, v20
-; GISEL-NEXT: v_mov_b32_e32 v9, v21
-; GISEL-NEXT: v_mov_b32_e32 v10, v22
-; GISEL-NEXT: v_mov_b32_e32 v11, v23
-; GISEL-NEXT: v_mov_b32_e32 v12, v24
-; GISEL-NEXT: v_mov_b32_e32 v13, v25
-; GISEL-NEXT: v_mov_b32_e32 v14, v26
-; GISEL-NEXT: v_mov_b32_e32 v15, v27
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_mov_b32_e32 v0, v12
+; GCN-NEXT: v_mov_b32_e32 v1, v13
+; GCN-NEXT: v_mov_b32_e32 v2, v14
+; GCN-NEXT: v_mov_b32_e32 v3, v15
+; GCN-NEXT: v_mov_b32_e32 v4, v16
+; GCN-NEXT: v_mov_b32_e32 v5, v17
+; GCN-NEXT: v_mov_b32_e32 v6, v18
+; GCN-NEXT: v_mov_b32_e32 v7, v19
+; GCN-NEXT: v_mov_b32_e32 v8, v20
+; GCN-NEXT: v_mov_b32_e32 v9, v21
+; GCN-NEXT: v_mov_b32_e32 v10, v22
+; GCN-NEXT: v_mov_b32_e32 v11, v23
+; GCN-NEXT: v_mov_b32_e32 v12, v24
+; GCN-NEXT: v_mov_b32_e32 v13, v25
+; GCN-NEXT: v_mov_b32_e32 v14, v26
+; GCN-NEXT: v_mov_b32_e32 v15, v27
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
ret <16 x float> %result
}
define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags1:
+; GCN-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_mov_b32_e32 v0, v12
+; GCN-NEXT: v_mov_b32_e32 v1, v13
+; GCN-NEXT: v_mov_b32_e32 v2, v14
+; GCN-NEXT: v_mov_b32_e32 v3, v15
+; GCN-NEXT: v_mov_b32_e32 v4, v16
+; GCN-NEXT: v_mov_b32_e32 v5, v17
+; GCN-NEXT: v_mov_b32_e32 v6, v18
+; GCN-NEXT: v_mov_b32_e32 v7, v19
+; GCN-NEXT: v_mov_b32_e32 v8, v20
+; GCN-NEXT: v_mov_b32_e32 v9, v21
+; GCN-NEXT: v_mov_b32_e32 v10, v22
+; GCN-NEXT: v_mov_b32_e32 v11, v23
+; GCN-NEXT: v_mov_b32_e32 v12, v24
+; GCN-NEXT: v_mov_b32_e32 v13, v25
+; GCN-NEXT: v_mov_b32_e32 v14, v26
+; GCN-NEXT: v_mov_b32_e32 v15, v27
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
+ ret <16 x float> %result
+}
+
+define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x float> inreg %arg2, i32 inreg %arg3) {
+; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_mov_b32_e32 v0, v12
-; SDAG-NEXT: v_mov_b32_e32 v1, v13
-; SDAG-NEXT: v_mov_b32_e32 v2, v14
-; SDAG-NEXT: v_mov_b32_e32 v3, v15
-; SDAG-NEXT: v_mov_b32_e32 v4, v16
-; SDAG-NEXT: v_mov_b32_e32 v5, v17
-; SDAG-NEXT: v_mov_b32_e32 v6, v18
-; SDAG-NEXT: v_mov_b32_e32 v7, v19
-; SDAG-NEXT: v_mov_b32_e32 v8, v20
-; SDAG-NEXT: v_mov_b32_e32 v9, v21
-; SDAG-NEXT: v_mov_b32_e32 v10, v22
-; SDAG-NEXT: v_mov_b32_e32 v11, v23
-; SDAG-NEXT: v_mov_b32_e32 v12, v24
-; SDAG-NEXT: v_mov_b32_e32 v13, v25
-; SDAG-NEXT: v_mov_b32_e32 v14, v26
-; SDAG-NEXT: v_mov_b32_e32 v15, v27
+; SDAG-NEXT: v_mov_b32_e32 v26, s0
+; SDAG-NEXT: v_mov_b32_e32 v27, s1
+; SDAG-NEXT: v_mov_b32_e32 v28, s2
+; SDAG-NEXT: v_mov_b32_e32 v29, s3
+; SDAG-NEXT: v_mov_b32_e32 v16, v10
+; SDAG-NEXT: v_mov_b32_e32 v15, v9
+; SDAG-NEXT: v_mov_b32_e32 v14, v8
+; SDAG-NEXT: v_mov_b32_e32 v13, v7
+; SDAG-NEXT: v_mov_b32_e32 v12, v6
+; SDAG-NEXT: v_mov_b32_e32 v11, v5
+; SDAG-NEXT: v_mov_b32_e32 v10, v4
+; SDAG-NEXT: v_mov_b32_e32 v9, v3
+; SDAG-NEXT: v_mov_b32_e32 v8, v2
+; SDAG-NEXT: v_mov_b32_e32 v7, v1
+; SDAG-NEXT: v_mov_b32_e32 v6, v0
+; SDAG-NEXT: v_mov_b32_e32 v0, s24
+; SDAG-NEXT: v_mov_b32_e32 v1, s25
+; SDAG-NEXT: v_mov_b32_e32 v2, s26
+; SDAG-NEXT: v_mov_b32_e32 v3, s27
+; SDAG-NEXT: v_mov_b32_e32 v4, s28
+; SDAG-NEXT: v_mov_b32_e32 v5, s29
+; SDAG-NEXT: v_mov_b32_e32 v18, s16
+; SDAG-NEXT: v_mov_b32_e32 v19, s17
+; SDAG-NEXT: v_mov_b32_e32 v20, s18
+; SDAG-NEXT: v_mov_b32_e32 v21, s19
+; SDAG-NEXT: v_mov_b32_e32 v22, s20
+; SDAG-NEXT: v_mov_b32_e32 v23, s21
+; SDAG-NEXT: v_mov_b32_e32 v24, s22
+; SDAG-NEXT: v_mov_b32_e32 v25, s23
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[26:29], v[18:25], v16
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags1:
+; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__sgpr:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b32_e32 v48, v0
-; GISEL-NEXT: v_mov_b32_e32 v49, v1
-; GISEL-NEXT: v_mov_b32_e32 v50, v2
-; GISEL-NEXT: v_mov_b32_e32 v51, v3
-; GISEL-NEXT: v_mov_b32_e32 v30, v4
-; GISEL-NEXT: v_mov_b32_e32 v31, v5
-; GISEL-NEXT: v_mov_b32_e32 v32, v6
-; GISEL-NEXT: v_mov_b32_e32 v33, v7
-; GISEL-NEXT: v_mov_b32_e32 v34, v8
-; GISEL-NEXT: v_mov_b32_e32 v35, v9
-; GISEL-NEXT: v_mov_b32_e32 v36, v10
-; GISEL-NEXT: v_mov_b32_e32 v37, v11
+; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[2:3]
+; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[0:1]
+; GISEL-NEXT: v_mov_b32_e32 v18, v0
+; GISEL-NEXT: v_mov_b32_e32 v19, v1
+; GISEL-NEXT: v_mov_b32_e32 v20, v2
+; GISEL-NEXT: v_mov_b32_e32 v21, v3
+; GISEL-NEXT: v_mov_b32_e32 v22, v4
+; GISEL-NEXT: v_mov_b32_e32 v23, v5
+; GISEL-NEXT: v_mov_b32_e32 v24, v6
+; GISEL-NEXT: v_mov_b32_e32 v25, v7
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[16:17]
+; GISEL-NEXT: v_mov_b32_e32 v26, v8
+; GISEL-NEXT: v_mov_b32_e32 v27, v9
+; GISEL-NEXT: v_mov_b32_e32 v12, s24
+; GISEL-NEXT: v_mov_b32_e32 v13, s25
+; GISEL-NEXT: v_mov_b32_e32 v14, s26
+; GISEL-NEXT: v_mov_b32_e32 v15, s27
+; GISEL-NEXT: v_mov_b32_e32 v16, s28
+; GISEL-NEXT: v_mov_b32_e32 v17, s29
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[22:23]
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[28:31], v[0:7], v10
+; GISEL-NEXT: s_nop 11
; GISEL-NEXT: v_mov_b32_e32 v0, v12
; GISEL-NEXT: v_mov_b32_e32 v1, v13
; GISEL-NEXT: v_mov_b32_e32 v2, v14
@@ -3041,104 +2548,6 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags1(<4 x i32> %arg0, <
; GISEL-NEXT: v_mov_b32_e32 v13, v25
; GISEL-NEXT: v_mov_b32_e32 v14, v26
; GISEL-NEXT: v_mov_b32_e32 v15, v27
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1
-; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
- ret <16 x float> %result
-}
-
-define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x float> inreg %arg2, i32 inreg %arg3) {
-; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__sgpr:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v36, s0
-; SDAG-NEXT: v_mov_b32_e32 v37, s1
-; SDAG-NEXT: v_mov_b32_e32 v38, s2
-; SDAG-NEXT: v_mov_b32_e32 v39, s3
-; SDAG-NEXT: v_mov_b32_e32 v13, s25
-; SDAG-NEXT: v_mov_b32_e32 v14, s26
-; SDAG-NEXT: v_mov_b32_e32 v15, s27
-; SDAG-NEXT: v_mov_b32_e32 v16, s28
-; SDAG-NEXT: v_mov_b32_e32 v17, s29
-; SDAG-NEXT: v_mov_b32_e32 v28, s16
-; SDAG-NEXT: v_mov_b32_e32 v29, s17
-; SDAG-NEXT: v_mov_b32_e32 v30, s18
-; SDAG-NEXT: v_mov_b32_e32 v31, s19
-; SDAG-NEXT: v_mov_b32_e32 v32, s20
-; SDAG-NEXT: v_mov_b32_e32 v33, s21
-; SDAG-NEXT: v_mov_b32_e32 v34, s22
-; SDAG-NEXT: v_mov_b32_e32 v35, s23
-; SDAG-NEXT: v_mov_b32_e32 v12, s24
-; SDAG-NEXT: v_mov_b32_e32 v18, v0
-; SDAG-NEXT: v_mov_b32_e32 v19, v1
-; SDAG-NEXT: v_mov_b32_e32 v20, v2
-; SDAG-NEXT: v_mov_b32_e32 v21, v3
-; SDAG-NEXT: v_mov_b32_e32 v22, v4
-; SDAG-NEXT: v_mov_b32_e32 v23, v5
-; SDAG-NEXT: v_mov_b32_e32 v24, v6
-; SDAG-NEXT: v_mov_b32_e32 v25, v7
-; SDAG-NEXT: v_mov_b32_e32 v26, v8
-; SDAG-NEXT: v_mov_b32_e32 v27, v9
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[36:39], v[28:35], v10
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_mov_b32_e32 v0, v12
-; SDAG-NEXT: v_mov_b32_e32 v1, v13
-; SDAG-NEXT: v_mov_b32_e32 v2, v14
-; SDAG-NEXT: v_mov_b32_e32 v3, v15
-; SDAG-NEXT: v_mov_b32_e32 v4, v16
-; SDAG-NEXT: v_mov_b32_e32 v5, v17
-; SDAG-NEXT: v_mov_b32_e32 v6, v18
-; SDAG-NEXT: v_mov_b32_e32 v7, v19
-; SDAG-NEXT: v_mov_b32_e32 v8, v20
-; SDAG-NEXT: v_mov_b32_e32 v9, v21
-; SDAG-NEXT: v_mov_b32_e32 v10, v22
-; SDAG-NEXT: v_mov_b32_e32 v11, v23
-; SDAG-NEXT: v_mov_b32_e32 v12, v24
-; SDAG-NEXT: v_mov_b32_e32 v13, v25
-; SDAG-NEXT: v_mov_b32_e32 v14, v26
-; SDAG-NEXT: v_mov_b32_e32 v15, v27
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__sgpr:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[2:3]
-; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[0:1]
-; GISEL-NEXT: v_mov_b32_e32 v11, v0
-; GISEL-NEXT: v_mov_b32_e32 v12, v1
-; GISEL-NEXT: v_mov_b32_e32 v13, v2
-; GISEL-NEXT: v_mov_b32_e32 v14, v3
-; GISEL-NEXT: v_mov_b32_e32 v15, v4
-; GISEL-NEXT: v_mov_b32_e32 v16, v5
-; GISEL-NEXT: v_mov_b32_e32 v17, v6
-; GISEL-NEXT: v_mov_b32_e32 v18, v7
-; GISEL-NEXT: v_mov_b32_e32 v19, v8
-; GISEL-NEXT: v_mov_b32_e32 v20, v9
-; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[22:23]
-; GISEL-NEXT: v_mov_b32_e32 v21, v10
-; GISEL-NEXT: v_mov_b32_e32 v0, s24
-; GISEL-NEXT: v_mov_b32_e32 v1, s25
-; GISEL-NEXT: v_mov_b32_e32 v2, s26
-; GISEL-NEXT: v_mov_b32_e32 v3, s27
-; GISEL-NEXT: v_mov_b32_e32 v4, s28
-; GISEL-NEXT: v_mov_b32_e32 v5, s29
-; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17]
-; GISEL-NEXT: v_mov_b32_e32 v6, v11
-; GISEL-NEXT: v_mov_b32_e32 v7, v12
-; GISEL-NEXT: v_mov_b32_e32 v8, v13
-; GISEL-NEXT: v_mov_b32_e32 v9, v14
-; GISEL-NEXT: v_mov_b32_e32 v10, v15
-; GISEL-NEXT: v_mov_b32_e32 v11, v16
-; GISEL-NEXT: v_mov_b32_e32 v12, v17
-; GISEL-NEXT: v_mov_b32_e32 v13, v18
-; GISEL-NEXT: v_mov_b32_e32 v14, v19
-; GISEL-NEXT: v_mov_b32_e32 v15, v20
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[30:33], v[22:29], v21
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <16 x float> %result
@@ -3230,168 +2639,151 @@ bb:
}
define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_mov_b32_e32 v0, v12
-; SDAG-NEXT: v_mov_b32_e32 v1, v13
-; SDAG-NEXT: v_mov_b32_e32 v2, v14
-; SDAG-NEXT: v_mov_b32_e32 v3, v15
-; SDAG-NEXT: v_mov_b32_e32 v4, v16
-; SDAG-NEXT: v_mov_b32_e32 v5, v17
-; SDAG-NEXT: v_mov_b32_e32 v6, v18
-; SDAG-NEXT: v_mov_b32_e32 v7, v19
-; SDAG-NEXT: v_mov_b32_e32 v8, v20
-; SDAG-NEXT: v_mov_b32_e32 v9, v21
-; SDAG-NEXT: v_mov_b32_e32 v10, v22
-; SDAG-NEXT: v_mov_b32_e32 v11, v23
-; SDAG-NEXT: v_mov_b32_e32 v12, v24
-; SDAG-NEXT: v_mov_b32_e32 v13, v25
-; SDAG-NEXT: v_mov_b32_e32 v14, v26
-; SDAG-NEXT: v_mov_b32_e32 v15, v27
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b32_e32 v48, v0
-; GISEL-NEXT: v_mov_b32_e32 v49, v1
-; GISEL-NEXT: v_mov_b32_e32 v50, v2
-; GISEL-NEXT: v_mov_b32_e32 v51, v3
-; GISEL-NEXT: v_mov_b32_e32 v30, v4
-; GISEL-NEXT: v_mov_b32_e32 v31, v5
-; GISEL-NEXT: v_mov_b32_e32 v32, v6
-; GISEL-NEXT: v_mov_b32_e32 v33, v7
-; GISEL-NEXT: v_mov_b32_e32 v34, v8
-; GISEL-NEXT: v_mov_b32_e32 v35, v9
-; GISEL-NEXT: v_mov_b32_e32 v36, v10
-; GISEL-NEXT: v_mov_b32_e32 v37, v11
-; GISEL-NEXT: v_mov_b32_e32 v0, v12
-; GISEL-NEXT: v_mov_b32_e32 v1, v13
-; GISEL-NEXT: v_mov_b32_e32 v2, v14
-; GISEL-NEXT: v_mov_b32_e32 v3, v15
-; GISEL-NEXT: v_mov_b32_e32 v4, v16
-; GISEL-NEXT: v_mov_b32_e32 v5, v17
-; GISEL-NEXT: v_mov_b32_e32 v6, v18
-; GISEL-NEXT: v_mov_b32_e32 v7, v19
-; GISEL-NEXT: v_mov_b32_e32 v8, v20
-; GISEL-NEXT: v_mov_b32_e32 v9, v21
-; GISEL-NEXT: v_mov_b32_e32 v10, v22
-; GISEL-NEXT: v_mov_b32_e32 v11, v23
-; GISEL-NEXT: v_mov_b32_e32 v12, v24
-; GISEL-NEXT: v_mov_b32_e32 v13, v25
-; GISEL-NEXT: v_mov_b32_e32 v14, v26
-; GISEL-NEXT: v_mov_b32_e32 v15, v27
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[48:51], v[30:37], v28
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_smfmac_f32_32x32x64_fp8_fp8:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_mov_b32_e32 v0, v12
+; GCN-NEXT: v_mov_b32_e32 v1, v13
+; GCN-NEXT: v_mov_b32_e32 v2, v14
+; GCN-NEXT: v_mov_b32_e32 v3, v15
+; GCN-NEXT: v_mov_b32_e32 v4, v16
+; GCN-NEXT: v_mov_b32_e32 v5, v17
+; GCN-NEXT: v_mov_b32_e32 v6, v18
+; GCN-NEXT: v_mov_b32_e32 v7, v19
+; GCN-NEXT: v_mov_b32_e32 v8, v20
+; GCN-NEXT: v_mov_b32_e32 v9, v21
+; GCN-NEXT: v_mov_b32_e32 v10, v22
+; GCN-NEXT: v_mov_b32_e32 v11, v23
+; GCN-NEXT: v_mov_b32_e32 v12, v24
+; GCN-NEXT: v_mov_b32_e32 v13, v25
+; GCN-NEXT: v_mov_b32_e32 v14, v26
+; GCN-NEXT: v_mov_b32_e32 v15, v27
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <16 x float> %result
}
define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags0:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_mov_b32_e32 v0, v12
-; SDAG-NEXT: v_mov_b32_e32 v1, v13
-; SDAG-NEXT: v_mov_b32_e32 v2, v14
-; SDAG-NEXT: v_mov_b32_e32 v3, v15
-; SDAG-NEXT: v_mov_b32_e32 v4, v16
-; SDAG-NEXT: v_mov_b32_e32 v5, v17
-; SDAG-NEXT: v_mov_b32_e32 v6, v18
-; SDAG-NEXT: v_mov_b32_e32 v7, v19
-; SDAG-NEXT: v_mov_b32_e32 v8, v20
-; SDAG-NEXT: v_mov_b32_e32 v9, v21
-; SDAG-NEXT: v_mov_b32_e32 v10, v22
-; SDAG-NEXT: v_mov_b32_e32 v11, v23
-; SDAG-NEXT: v_mov_b32_e32 v12, v24
-; SDAG-NEXT: v_mov_b32_e32 v13, v25
-; SDAG-NEXT: v_mov_b32_e32 v14, v26
-; SDAG-NEXT: v_mov_b32_e32 v15, v27
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags0:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b32_e32 v48, v0
-; GISEL-NEXT: v_mov_b32_e32 v49, v1
-; GISEL-NEXT: v_mov_b32_e32 v50, v2
-; GISEL-NEXT: v_mov_b32_e32 v51, v3
-; GISEL-NEXT: v_mov_b32_e32 v30, v4
-; GISEL-NEXT: v_mov_b32_e32 v31, v5
-; GISEL-NEXT: v_mov_b32_e32 v32, v6
-; GISEL-NEXT: v_mov_b32_e32 v33, v7
-; GISEL-NEXT: v_mov_b32_e32 v34, v8
-; GISEL-NEXT: v_mov_b32_e32 v35, v9
-; GISEL-NEXT: v_mov_b32_e32 v36, v10
-; GISEL-NEXT: v_mov_b32_e32 v37, v11
-; GISEL-NEXT: v_mov_b32_e32 v0, v12
-; GISEL-NEXT: v_mov_b32_e32 v1, v13
-; GISEL-NEXT: v_mov_b32_e32 v2, v14
-; GISEL-NEXT: v_mov_b32_e32 v3, v15
-; GISEL-NEXT: v_mov_b32_e32 v4, v16
-; GISEL-NEXT: v_mov_b32_e32 v5, v17
-; GISEL-NEXT: v_mov_b32_e32 v6, v18
-; GISEL-NEXT: v_mov_b32_e32 v7, v19
-; GISEL-NEXT: v_mov_b32_e32 v8, v20
-; GISEL-NEXT: v_mov_b32_e32 v9, v21
-; GISEL-NEXT: v_mov_b32_e32 v10, v22
-; GISEL-NEXT: v_mov_b32_e32 v11, v23
-; GISEL-NEXT: v_mov_b32_e32 v12, v24
-; GISEL-NEXT: v_mov_b32_e32 v13, v25
-; GISEL-NEXT: v_mov_b32_e32 v14, v26
-; GISEL-NEXT: v_mov_b32_e32 v15, v27
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_mov_b32_e32 v0, v12
+; GCN-NEXT: v_mov_b32_e32 v1, v13
+; GCN-NEXT: v_mov_b32_e32 v2, v14
+; GCN-NEXT: v_mov_b32_e32 v3, v15
+; GCN-NEXT: v_mov_b32_e32 v4, v16
+; GCN-NEXT: v_mov_b32_e32 v5, v17
+; GCN-NEXT: v_mov_b32_e32 v6, v18
+; GCN-NEXT: v_mov_b32_e32 v7, v19
+; GCN-NEXT: v_mov_b32_e32 v8, v20
+; GCN-NEXT: v_mov_b32_e32 v9, v21
+; GCN-NEXT: v_mov_b32_e32 v10, v22
+; GCN-NEXT: v_mov_b32_e32 v11, v23
+; GCN-NEXT: v_mov_b32_e32 v12, v24
+; GCN-NEXT: v_mov_b32_e32 v13, v25
+; GCN-NEXT: v_mov_b32_e32 v14, v26
+; GCN-NEXT: v_mov_b32_e32 v15, v27
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
ret <16 x float> %result
}
define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags1:
+; GCN-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_mov_b32_e32 v0, v12
+; GCN-NEXT: v_mov_b32_e32 v1, v13
+; GCN-NEXT: v_mov_b32_e32 v2, v14
+; GCN-NEXT: v_mov_b32_e32 v3, v15
+; GCN-NEXT: v_mov_b32_e32 v4, v16
+; GCN-NEXT: v_mov_b32_e32 v5, v17
+; GCN-NEXT: v_mov_b32_e32 v6, v18
+; GCN-NEXT: v_mov_b32_e32 v7, v19
+; GCN-NEXT: v_mov_b32_e32 v8, v20
+; GCN-NEXT: v_mov_b32_e32 v9, v21
+; GCN-NEXT: v_mov_b32_e32 v10, v22
+; GCN-NEXT: v_mov_b32_e32 v11, v23
+; GCN-NEXT: v_mov_b32_e32 v12, v24
+; GCN-NEXT: v_mov_b32_e32 v13, v25
+; GCN-NEXT: v_mov_b32_e32 v14, v26
+; GCN-NEXT: v_mov_b32_e32 v15, v27
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
+ ret <16 x float> %result
+}
+
+define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x float> inreg %arg2, i32 inreg %arg3) {
+; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_mov_b32_e32 v0, v12
-; SDAG-NEXT: v_mov_b32_e32 v1, v13
-; SDAG-NEXT: v_mov_b32_e32 v2, v14
-; SDAG-NEXT: v_mov_b32_e32 v3, v15
-; SDAG-NEXT: v_mov_b32_e32 v4, v16
-; SDAG-NEXT: v_mov_b32_e32 v5, v17
-; SDAG-NEXT: v_mov_b32_e32 v6, v18
-; SDAG-NEXT: v_mov_b32_e32 v7, v19
-; SDAG-NEXT: v_mov_b32_e32 v8, v20
-; SDAG-NEXT: v_mov_b32_e32 v9, v21
-; SDAG-NEXT: v_mov_b32_e32 v10, v22
-; SDAG-NEXT: v_mov_b32_e32 v11, v23
-; SDAG-NEXT: v_mov_b32_e32 v12, v24
-; SDAG-NEXT: v_mov_b32_e32 v13, v25
-; SDAG-NEXT: v_mov_b32_e32 v14, v26
-; SDAG-NEXT: v_mov_b32_e32 v15, v27
+; SDAG-NEXT: v_mov_b32_e32 v26, s0
+; SDAG-NEXT: v_mov_b32_e32 v27, s1
+; SDAG-NEXT: v_mov_b32_e32 v28, s2
+; SDAG-NEXT: v_mov_b32_e32 v29, s3
+; SDAG-NEXT: v_mov_b32_e32 v16, v10
+; SDAG-NEXT: v_mov_b32_e32 v15, v9
+; SDAG-NEXT: v_mov_b32_e32 v14, v8
+; SDAG-NEXT: v_mov_b32_e32 v13, v7
+; SDAG-NEXT: v_mov_b32_e32 v12, v6
+; SDAG-NEXT: v_mov_b32_e32 v11, v5
+; SDAG-NEXT: v_mov_b32_e32 v10, v4
+; SDAG-NEXT: v_mov_b32_e32 v9, v3
+; SDAG-NEXT: v_mov_b32_e32 v8, v2
+; SDAG-NEXT: v_mov_b32_e32 v7, v1
+; SDAG-NEXT: v_mov_b32_e32 v6, v0
+; SDAG-NEXT: v_mov_b32_e32 v0, s24
+; SDAG-NEXT: v_mov_b32_e32 v1, s25
+; SDAG-NEXT: v_mov_b32_e32 v2, s26
+; SDAG-NEXT: v_mov_b32_e32 v3, s27
+; SDAG-NEXT: v_mov_b32_e32 v4, s28
+; SDAG-NEXT: v_mov_b32_e32 v5, s29
+; SDAG-NEXT: v_mov_b32_e32 v18, s16
+; SDAG-NEXT: v_mov_b32_e32 v19, s17
+; SDAG-NEXT: v_mov_b32_e32 v20, s18
+; SDAG-NEXT: v_mov_b32_e32 v21, s19
+; SDAG-NEXT: v_mov_b32_e32 v22, s20
+; SDAG-NEXT: v_mov_b32_e32 v23, s21
+; SDAG-NEXT: v_mov_b32_e32 v24, s22
+; SDAG-NEXT: v_mov_b32_e32 v25, s23
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[26:29], v[18:25], v16
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags1:
+; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__sgpr:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b32_e32 v48, v0
-; GISEL-NEXT: v_mov_b32_e32 v49, v1
-; GISEL-NEXT: v_mov_b32_e32 v50, v2
-; GISEL-NEXT: v_mov_b32_e32 v51, v3
-; GISEL-NEXT: v_mov_b32_e32 v30, v4
-; GISEL-NEXT: v_mov_b32_e32 v31, v5
-; GISEL-NEXT: v_mov_b32_e32 v32, v6
-; GISEL-NEXT: v_mov_b32_e32 v33, v7
-; GISEL-NEXT: v_mov_b32_e32 v34, v8
-; GISEL-NEXT: v_mov_b32_e32 v35, v9
-; GISEL-NEXT: v_mov_b32_e32 v36, v10
-; GISEL-NEXT: v_mov_b32_e32 v37, v11
+; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[2:3]
+; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[0:1]
+; GISEL-NEXT: v_mov_b32_e32 v18, v0
+; GISEL-NEXT: v_mov_b32_e32 v19, v1
+; GISEL-NEXT: v_mov_b32_e32 v20, v2
+; GISEL-NEXT: v_mov_b32_e32 v21, v3
+; GISEL-NEXT: v_mov_b32_e32 v22, v4
+; GISEL-NEXT: v_mov_b32_e32 v23, v5
+; GISEL-NEXT: v_mov_b32_e32 v24, v6
+; GISEL-NEXT: v_mov_b32_e32 v25, v7
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[16:17]
+; GISEL-NEXT: v_mov_b32_e32 v26, v8
+; GISEL-NEXT: v_mov_b32_e32 v27, v9
+; GISEL-NEXT: v_mov_b32_e32 v12, s24
+; GISEL-NEXT: v_mov_b32_e32 v13, s25
+; GISEL-NEXT: v_mov_b32_e32 v14, s26
+; GISEL-NEXT: v_mov_b32_e32 v15, s27
+; GISEL-NEXT: v_mov_b32_e32 v16, s28
+; GISEL-NEXT: v_mov_b32_e32 v17, s29
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[22:23]
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[28:31], v[0:7], v10
+; GISEL-NEXT: s_nop 11
; GISEL-NEXT: v_mov_b32_e32 v0, v12
; GISEL-NEXT: v_mov_b32_e32 v1, v13
; GISEL-NEXT: v_mov_b32_e32 v2, v14
@@ -3408,104 +2800,6 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags1(<4 x i32> %arg0, <
; GISEL-NEXT: v_mov_b32_e32 v13, v25
; GISEL-NEXT: v_mov_b32_e32 v14, v26
; GISEL-NEXT: v_mov_b32_e32 v15, v27
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1
-; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
- ret <16 x float> %result
-}
-
-define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x float> inreg %arg2, i32 inreg %arg3) {
-; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__sgpr:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v36, s0
-; SDAG-NEXT: v_mov_b32_e32 v37, s1
-; SDAG-NEXT: v_mov_b32_e32 v38, s2
-; SDAG-NEXT: v_mov_b32_e32 v39, s3
-; SDAG-NEXT: v_mov_b32_e32 v13, s25
-; SDAG-NEXT: v_mov_b32_e32 v14, s26
-; SDAG-NEXT: v_mov_b32_e32 v15, s27
-; SDAG-NEXT: v_mov_b32_e32 v16, s28
-; SDAG-NEXT: v_mov_b32_e32 v17, s29
-; SDAG-NEXT: v_mov_b32_e32 v28, s16
-; SDAG-NEXT: v_mov_b32_e32 v29, s17
-; SDAG-NEXT: v_mov_b32_e32 v30, s18
-; SDAG-NEXT: v_mov_b32_e32 v31, s19
-; SDAG-NEXT: v_mov_b32_e32 v32, s20
-; SDAG-NEXT: v_mov_b32_e32 v33, s21
-; SDAG-NEXT: v_mov_b32_e32 v34, s22
-; SDAG-NEXT: v_mov_b32_e32 v35, s23
-; SDAG-NEXT: v_mov_b32_e32 v12, s24
-; SDAG-NEXT: v_mov_b32_e32 v18, v0
-; SDAG-NEXT: v_mov_b32_e32 v19, v1
-; SDAG-NEXT: v_mov_b32_e32 v20, v2
-; SDAG-NEXT: v_mov_b32_e32 v21, v3
-; SDAG-NEXT: v_mov_b32_e32 v22, v4
-; SDAG-NEXT: v_mov_b32_e32 v23, v5
-; SDAG-NEXT: v_mov_b32_e32 v24, v6
-; SDAG-NEXT: v_mov_b32_e32 v25, v7
-; SDAG-NEXT: v_mov_b32_e32 v26, v8
-; SDAG-NEXT: v_mov_b32_e32 v27, v9
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[36:39], v[28:35], v10
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_mov_b32_e32 v0, v12
-; SDAG-NEXT: v_mov_b32_e32 v1, v13
-; SDAG-NEXT: v_mov_b32_e32 v2, v14
-; SDAG-NEXT: v_mov_b32_e32 v3, v15
-; SDAG-NEXT: v_mov_b32_e32 v4, v16
-; SDAG-NEXT: v_mov_b32_e32 v5, v17
-; SDAG-NEXT: v_mov_b32_e32 v6, v18
-; SDAG-NEXT: v_mov_b32_e32 v7, v19
-; SDAG-NEXT: v_mov_b32_e32 v8, v20
-; SDAG-NEXT: v_mov_b32_e32 v9, v21
-; SDAG-NEXT: v_mov_b32_e32 v10, v22
-; SDAG-NEXT: v_mov_b32_e32 v11, v23
-; SDAG-NEXT: v_mov_b32_e32 v12, v24
-; SDAG-NEXT: v_mov_b32_e32 v13, v25
-; SDAG-NEXT: v_mov_b32_e32 v14, v26
-; SDAG-NEXT: v_mov_b32_e32 v15, v27
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__sgpr:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[2:3]
-; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[0:1]
-; GISEL-NEXT: v_mov_b32_e32 v11, v0
-; GISEL-NEXT: v_mov_b32_e32 v12, v1
-; GISEL-NEXT: v_mov_b32_e32 v13, v2
-; GISEL-NEXT: v_mov_b32_e32 v14, v3
-; GISEL-NEXT: v_mov_b32_e32 v15, v4
-; GISEL-NEXT: v_mov_b32_e32 v16, v5
-; GISEL-NEXT: v_mov_b32_e32 v17, v6
-; GISEL-NEXT: v_mov_b32_e32 v18, v7
-; GISEL-NEXT: v_mov_b32_e32 v19, v8
-; GISEL-NEXT: v_mov_b32_e32 v20, v9
-; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[22:23]
-; GISEL-NEXT: v_mov_b32_e32 v21, v10
-; GISEL-NEXT: v_mov_b32_e32 v0, s24
-; GISEL-NEXT: v_mov_b32_e32 v1, s25
-; GISEL-NEXT: v_mov_b32_e32 v2, s26
-; GISEL-NEXT: v_mov_b32_e32 v3, s27
-; GISEL-NEXT: v_mov_b32_e32 v4, s28
-; GISEL-NEXT: v_mov_b32_e32 v5, s29
-; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17]
-; GISEL-NEXT: v_mov_b32_e32 v6, v11
-; GISEL-NEXT: v_mov_b32_e32 v7, v12
-; GISEL-NEXT: v_mov_b32_e32 v8, v13
-; GISEL-NEXT: v_mov_b32_e32 v9, v14
-; GISEL-NEXT: v_mov_b32_e32 v10, v15
-; GISEL-NEXT: v_mov_b32_e32 v11, v16
-; GISEL-NEXT: v_mov_b32_e32 v12, v17
-; GISEL-NEXT: v_mov_b32_e32 v13, v18
-; GISEL-NEXT: v_mov_b32_e32 v14, v19
-; GISEL-NEXT: v_mov_b32_e32 v15, v20
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[30:33], v[22:29], v21
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <16 x float> %result
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll
index 4366472c73a0e..d3e171be10802 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll
@@ -246,7 +246,6 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__vgpr_rsrc__vgpr_v
; GFX90A-NEXT: buffer_atomic_add_f32 v0, v[8:9], s[8:11], s12 idxen offen
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
; GFX90A-NEXT: ; implicit-def: $vgpr7
-; GFX90A-NEXT: ; implicit-def: $vgpr0
; GFX90A-NEXT: ; implicit-def: $vgpr8_vgpr9
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB4_1
@@ -280,7 +279,6 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__vgpr_rsrc__vgpr_v
; GFX942-NEXT: buffer_atomic_add_f32 v0, v[8:9], s[4:7], s8 idxen offen
; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
; GFX942-NEXT: ; implicit-def: $vgpr7
-; GFX942-NEXT: ; implicit-def: $vgpr0
; GFX942-NEXT: ; implicit-def: $vgpr8_vgpr9
; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB4_1
@@ -420,7 +418,6 @@ define void @struct_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__vgpr_rsrc__vgpr
; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v[8:9], s[8:11], s12 idxen offen
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
; GFX90A-NEXT: ; implicit-def: $vgpr7
-; GFX90A-NEXT: ; implicit-def: $vgpr0
; GFX90A-NEXT: ; implicit-def: $vgpr8_vgpr9
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB5_1
@@ -454,7 +451,6 @@ define void @struct_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__vgpr_rsrc__vgpr
; GFX942-NEXT: buffer_atomic_pk_add_f16 v0, v[8:9], s[4:7], s8 idxen offen
; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
; GFX942-NEXT: ; implicit-def: $vgpr7
-; GFX942-NEXT: ; implicit-def: $vgpr0
; GFX942-NEXT: ; implicit-def: $vgpr8_vgpr9
; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB5_1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll
index 0191a85b33888..5b72e006072df 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll
@@ -193,7 +193,8 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__vgpr_rsrc__vgpr_vo
; GFX90A-NEXT: s_and_b64 s[4:5], s[4:5], vcc
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: buffer_atomic_add_f32 v0, v[8:9], s[8:11], s12 idxen offen glc
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
+; GFX90A-NEXT: buffer_atomic_add_f32 v1, v[8:9], s[8:11], s12 idxen offen glc
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
; GFX90A-NEXT: ; implicit-def: $vgpr7
; GFX90A-NEXT: ; implicit-def: $vgpr8_vgpr9
@@ -202,6 +203,7 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__vgpr_rsrc__vgpr_vo
; GFX90A-NEXT: ; %bb.2:
; GFX90A-NEXT: s_mov_b64 exec, s[6:7]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v0, v1
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__vgpr_rsrc__vgpr_voffset__vgpr_soffset:
@@ -227,7 +229,8 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__vgpr_rsrc__vgpr_vo
; GFX942-NEXT: s_and_b64 s[0:1], s[0:1], vcc
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: buffer_atomic_add_f32 v0, v[8:9], s[4:7], s8 idxen offen sc0
+; GFX942-NEXT: v_mov_b32_e32 v1, v0
+; GFX942-NEXT: buffer_atomic_add_f32 v1, v[8:9], s[4:7], s8 idxen offen sc0
; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
; GFX942-NEXT: ; implicit-def: $vgpr7
; GFX942-NEXT: ; implicit-def: $vgpr8_vgpr9
@@ -236,6 +239,7 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__vgpr_rsrc__vgpr_vo
; GFX942-NEXT: ; %bb.2:
; GFX942-NEXT: s_mov_b64 exec, s[2:3]
; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v0, v1
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX1200-LABEL: struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__vgpr_rsrc__vgpr_voffset__vgpr_soffset:
@@ -339,7 +343,8 @@ define <2 x half> @struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__vgpr_rsrc__
; GFX90A-NEXT: s_and_b64 s[4:5], s[4:5], vcc
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v[8:9], s[8:11], s12 idxen offen glc
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
+; GFX90A-NEXT: buffer_atomic_pk_add_f16 v1, v[8:9], s[8:11], s12 idxen offen glc
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
; GFX90A-NEXT: ; implicit-def: $vgpr7
; GFX90A-NEXT: ; implicit-def: $vgpr8_vgpr9
@@ -348,6 +353,7 @@ define <2 x half> @struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__vgpr_rsrc__
; GFX90A-NEXT: ; %bb.2:
; GFX90A-NEXT: s_mov_b64 exec, s[6:7]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v0, v1
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__vgpr_rsrc__vgpr_voffset__vgpr_soffset:
@@ -373,7 +379,8 @@ define <2 x half> @struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__vgpr_rsrc__
; GFX942-NEXT: s_and_b64 s[0:1], s[0:1], vcc
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: buffer_atomic_pk_add_f16 v0, v[8:9], s[4:7], s8 idxen offen sc0
+; GFX942-NEXT: v_mov_b32_e32 v1, v0
+; GFX942-NEXT: buffer_atomic_pk_add_f16 v1, v[8:9], s[4:7], s8 idxen offen sc0
; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
; GFX942-NEXT: ; implicit-def: $vgpr7
; GFX942-NEXT: ; implicit-def: $vgpr8_vgpr9
@@ -382,6 +389,7 @@ define <2 x half> @struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__vgpr_rsrc__
; GFX942-NEXT: ; %bb.2:
; GFX942-NEXT: s_mov_b64 exec, s[2:3]
; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v0, v1
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX1200-LABEL: struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__vgpr_rsrc__vgpr_voffset__vgpr_soffset:
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll
index 9dac2393fd966..1c04ff3e83326 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll
@@ -85,7 +85,7 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p
; GFX942-SDAG-NEXT: s_mov_b32 s3, s12
; GFX942-SDAG-NEXT: s_or_b64 s[8:9], s[2:3], s[12:13]
; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX942-SDAG-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen nt
+; GFX942-SDAG-NEXT: buffer_load_dword v1, v0, s[8:11], 0 offen nt
; GFX942-SDAG-NEXT: s_load_dword s13, s[4:5], 0x30
; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20
; GFX942-SDAG-NEXT: s_mov_b32 s5, s12
@@ -96,9 +96,9 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p
; GFX942-SDAG-NEXT: s_mov_b32 s2, s1
; GFX942-SDAG-NEXT: s_mov_b32 s3, s12
; GFX942-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, s0
; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX942-SDAG-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen nt
+; GFX942-SDAG-NEXT: buffer_store_dword v1, v0, s[4:7], 0 offen nt
; GFX942-SDAG-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: buffer_nontemporal_load_store:
@@ -115,7 +115,7 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p
; GFX942-GISEL-NEXT: s_mov_b32 s6, s3
; GFX942-GISEL-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11]
; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX942-GISEL-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen nt
+; GFX942-GISEL-NEXT: buffer_load_dword v1, v0, s[8:11], 0 offen nt
; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20
; GFX942-GISEL-NEXT: s_load_dword s9, s[4:5], 0x30
; GFX942-GISEL-NEXT: s_mov_b32 s4, s7
@@ -126,9 +126,9 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p
; GFX942-GISEL-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
; GFX942-GISEL-NEXT: s_mov_b32 s6, s3
; GFX942-GISEL-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, s0
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX942-GISEL-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen nt
+; GFX942-GISEL-NEXT: buffer_store_dword v1, v0, s[4:7], 0 offen nt
; GFX942-GISEL-NEXT: s_endpgm
;
; GFX10-SDAG-LABEL: buffer_nontemporal_load_store:
@@ -413,7 +413,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX942-SDAG-NEXT: s_mov_b32 s3, s12
; GFX942-SDAG-NEXT: s_or_b64 s[8:9], s[2:3], s[12:13]
; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX942-SDAG-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen sc0 sc1
+; GFX942-SDAG-NEXT: buffer_load_dword v1, v0, s[8:11], 0 offen sc0 sc1
; GFX942-SDAG-NEXT: s_load_dword s13, s[4:5], 0x30
; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20
; GFX942-SDAG-NEXT: s_mov_b32 s5, s12
@@ -424,9 +424,9 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX942-SDAG-NEXT: s_mov_b32 s2, s1
; GFX942-SDAG-NEXT: s_mov_b32 s3, s12
; GFX942-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, s0
; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX942-SDAG-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen sc0 sc1
+; GFX942-SDAG-NEXT: buffer_store_dword v1, v0, s[4:7], 0 offen sc0 sc1
; GFX942-SDAG-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: buffer_nontemporal_and_volatile_load_store:
@@ -443,7 +443,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX942-GISEL-NEXT: s_mov_b32 s6, s3
; GFX942-GISEL-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11]
; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX942-GISEL-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen sc0 sc1
+; GFX942-GISEL-NEXT: buffer_load_dword v1, v0, s[8:11], 0 offen sc0 sc1
; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20
; GFX942-GISEL-NEXT: s_load_dword s9, s[4:5], 0x30
; GFX942-GISEL-NEXT: s_mov_b32 s4, s7
@@ -454,9 +454,9 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX942-GISEL-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
; GFX942-GISEL-NEXT: s_mov_b32 s6, s3
; GFX942-GISEL-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, s0
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX942-GISEL-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen sc0 sc1
+; GFX942-GISEL-NEXT: buffer_store_dword v1, v0, s[4:7], 0 offen sc0 sc1
; GFX942-GISEL-NEXT: s_endpgm
;
; GFX10-SDAG-LABEL: buffer_nontemporal_and_volatile_load_store:
diff --git a/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll b/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll
index 9585c486aeb9e..3c4a29c54928d 100644
--- a/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck --check-prefixes=GCN,GFX908 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --check-prefixes=GCN,GFX90A %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --check-prefixes=GCN,GFX90A %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GCN,GFX90A %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GCN,GFX90A %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck --check-prefixes=GFX908 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --check-prefixes=GFX90ADAG,GFX90A %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --check-prefixes=GFX90AGSEL,GFX90A %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GFX942DAG,GFX942 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GFX942GSEL,GFX942 %s
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)
@@ -86,62 +86,254 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vgpr(ptr addrspace(1) %arg)
; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31]
; GFX908-NEXT: s_nop 15
; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a31
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a30
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a29
; GFX908-NEXT: v_accvgpr_read_b32 v0, a28
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a31
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:112
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a19
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a18
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a17
; GFX908-NEXT: v_accvgpr_read_b32 v0, a16
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a19
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a23
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a22
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a21
; GFX908-NEXT: v_accvgpr_read_b32 v0, a20
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a23
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a11
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a10
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a9
; GFX908-NEXT: v_accvgpr_read_b32 v0, a8
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a11
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a15
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a13
; GFX908-NEXT: v_accvgpr_read_b32 v0, a12
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a15
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35]
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16
; GFX908-NEXT: s_endpgm
+;
+; GFX90ADAG-LABEL: test_mfma_f32_32x32x1f32_vgpr:
+; GFX90ADAG: ; %bb.0: ; %bb
+; GFX90ADAG-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX90ADAG-NEXT: v_mov_b32_e32 v33, 1.0
+; GFX90ADAG-NEXT: v_mov_b32_e32 v34, 2.0
+; GFX90ADAG-NEXT: v_mov_b32_e32 v32, 0
+; GFX90ADAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90ADAG-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0
+; GFX90ADAG-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40
+; GFX90ADAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90ADAG-NEXT: v_mov_b32_e32 v0, s16
+; GFX90ADAG-NEXT: v_mov_b32_e32 v1, s17
+; GFX90ADAG-NEXT: v_mov_b32_e32 v2, s18
+; GFX90ADAG-NEXT: v_mov_b32_e32 v3, s19
+; GFX90ADAG-NEXT: v_mov_b32_e32 v4, s20
+; GFX90ADAG-NEXT: v_mov_b32_e32 v5, s21
+; GFX90ADAG-NEXT: v_mov_b32_e32 v6, s22
+; GFX90ADAG-NEXT: v_mov_b32_e32 v7, s23
+; GFX90ADAG-NEXT: v_mov_b32_e32 v8, s24
+; GFX90ADAG-NEXT: v_mov_b32_e32 v9, s25
+; GFX90ADAG-NEXT: v_mov_b32_e32 v10, s26
+; GFX90ADAG-NEXT: v_mov_b32_e32 v11, s27
+; GFX90ADAG-NEXT: v_mov_b32_e32 v12, s28
+; GFX90ADAG-NEXT: v_mov_b32_e32 v13, s29
+; GFX90ADAG-NEXT: v_mov_b32_e32 v14, s30
+; GFX90ADAG-NEXT: v_mov_b32_e32 v15, s31
+; GFX90ADAG-NEXT: v_mov_b32_e32 v16, s0
+; GFX90ADAG-NEXT: v_mov_b32_e32 v17, s1
+; GFX90ADAG-NEXT: v_mov_b32_e32 v18, s2
+; GFX90ADAG-NEXT: v_mov_b32_e32 v19, s3
+; GFX90ADAG-NEXT: v_mov_b32_e32 v20, s4
+; GFX90ADAG-NEXT: v_mov_b32_e32 v21, s5
+; GFX90ADAG-NEXT: v_mov_b32_e32 v22, s6
+; GFX90ADAG-NEXT: v_mov_b32_e32 v23, s7
+; GFX90ADAG-NEXT: v_mov_b32_e32 v24, s8
+; GFX90ADAG-NEXT: v_mov_b32_e32 v25, s9
+; GFX90ADAG-NEXT: v_mov_b32_e32 v26, s10
+; GFX90ADAG-NEXT: v_mov_b32_e32 v27, s11
+; GFX90ADAG-NEXT: v_mov_b32_e32 v28, s12
+; GFX90ADAG-NEXT: v_mov_b32_e32 v29, s13
+; GFX90ADAG-NEXT: v_mov_b32_e32 v30, s14
+; GFX90ADAG-NEXT: v_mov_b32_e32 v31, s15
+; GFX90ADAG-NEXT: s_nop 1
+; GFX90ADAG-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v33, v34, v[0:31]
+; GFX90ADAG-NEXT: s_nop 15
+; GFX90ADAG-NEXT: s_nop 2
+; GFX90ADAG-NEXT: global_store_dwordx4 v32, v[24:27], s[34:35] offset:96
+; GFX90ADAG-NEXT: global_store_dwordx4 v32, v[28:31], s[34:35] offset:112
+; GFX90ADAG-NEXT: global_store_dwordx4 v32, v[16:19], s[34:35] offset:64
+; GFX90ADAG-NEXT: global_store_dwordx4 v32, v[20:23], s[34:35] offset:80
+; GFX90ADAG-NEXT: global_store_dwordx4 v32, v[8:11], s[34:35] offset:32
+; GFX90ADAG-NEXT: global_store_dwordx4 v32, v[12:15], s[34:35] offset:48
+; GFX90ADAG-NEXT: global_store_dwordx4 v32, v[0:3], s[34:35]
+; GFX90ADAG-NEXT: global_store_dwordx4 v32, v[4:7], s[34:35] offset:16
+; GFX90ADAG-NEXT: s_endpgm
+;
+; GFX90AGSEL-LABEL: test_mfma_f32_32x32x1f32_vgpr:
+; GFX90AGSEL: ; %bb.0: ; %bb
+; GFX90AGSEL-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX90AGSEL-NEXT: v_mov_b32_e32 v32, 1.0
+; GFX90AGSEL-NEXT: v_mov_b32_e32 v33, 2.0
+; GFX90AGSEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90AGSEL-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x0
+; GFX90AGSEL-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x40
+; GFX90AGSEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90AGSEL-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90AGSEL-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90AGSEL-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90AGSEL-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90AGSEL-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90AGSEL-NEXT: v_pk_mov_b32 v[10:11], s[10:11], s[10:11] op_sel:[0,1]
+; GFX90AGSEL-NEXT: v_pk_mov_b32 v[12:13], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90AGSEL-NEXT: v_pk_mov_b32 v[14:15], s[14:15], s[14:15] op_sel:[0,1]
+; GFX90AGSEL-NEXT: v_pk_mov_b32 v[16:17], s[16:17], s[16:17] op_sel:[0,1]
+; GFX90AGSEL-NEXT: v_pk_mov_b32 v[18:19], s[18:19], s[18:19] op_sel:[0,1]
+; GFX90AGSEL-NEXT: v_pk_mov_b32 v[20:21], s[20:21], s[20:21] op_sel:[0,1]
+; GFX90AGSEL-NEXT: v_pk_mov_b32 v[22:23], s[22:23], s[22:23] op_sel:[0,1]
+; GFX90AGSEL-NEXT: v_pk_mov_b32 v[24:25], s[24:25], s[24:25] op_sel:[0,1]
+; GFX90AGSEL-NEXT: v_pk_mov_b32 v[26:27], s[26:27], s[26:27] op_sel:[0,1]
+; GFX90AGSEL-NEXT: v_pk_mov_b32 v[28:29], s[28:29], s[28:29] op_sel:[0,1]
+; GFX90AGSEL-NEXT: v_pk_mov_b32 v[30:31], s[30:31], s[30:31] op_sel:[0,1]
+; GFX90AGSEL-NEXT: s_nop 1
+; GFX90AGSEL-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v32, v33, v[0:31]
+; GFX90AGSEL-NEXT: v_mov_b32_e32 v32, 0
+; GFX90AGSEL-NEXT: s_nop 15
+; GFX90AGSEL-NEXT: s_nop 1
+; GFX90AGSEL-NEXT: global_store_dwordx4 v32, v[0:3], s[34:35]
+; GFX90AGSEL-NEXT: global_store_dwordx4 v32, v[4:7], s[34:35] offset:16
+; GFX90AGSEL-NEXT: global_store_dwordx4 v32, v[8:11], s[34:35] offset:32
+; GFX90AGSEL-NEXT: global_store_dwordx4 v32, v[12:15], s[34:35] offset:48
+; GFX90AGSEL-NEXT: global_store_dwordx4 v32, v[16:19], s[34:35] offset:64
+; GFX90AGSEL-NEXT: global_store_dwordx4 v32, v[20:23], s[34:35] offset:80
+; GFX90AGSEL-NEXT: global_store_dwordx4 v32, v[24:27], s[34:35] offset:96
+; GFX90AGSEL-NEXT: global_store_dwordx4 v32, v[28:31], s[34:35] offset:112
+; GFX90AGSEL-NEXT: s_endpgm
+;
+; GFX942DAG-LABEL: test_mfma_f32_32x32x1f32_vgpr:
+; GFX942DAG: ; %bb.0: ; %bb
+; GFX942DAG-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX942DAG-NEXT: v_mov_b32_e32 v33, 1.0
+; GFX942DAG-NEXT: v_mov_b32_e32 v34, 2.0
+; GFX942DAG-NEXT: v_mov_b32_e32 v32, 0
+; GFX942DAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942DAG-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0
+; GFX942DAG-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40
+; GFX942DAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942DAG-NEXT: v_mov_b32_e32 v0, s16
+; GFX942DAG-NEXT: v_mov_b32_e32 v1, s17
+; GFX942DAG-NEXT: v_mov_b32_e32 v2, s18
+; GFX942DAG-NEXT: v_mov_b32_e32 v3, s19
+; GFX942DAG-NEXT: v_mov_b32_e32 v4, s20
+; GFX942DAG-NEXT: v_mov_b32_e32 v5, s21
+; GFX942DAG-NEXT: v_mov_b32_e32 v6, s22
+; GFX942DAG-NEXT: v_mov_b32_e32 v7, s23
+; GFX942DAG-NEXT: v_mov_b32_e32 v8, s24
+; GFX942DAG-NEXT: v_mov_b32_e32 v9, s25
+; GFX942DAG-NEXT: v_mov_b32_e32 v10, s26
+; GFX942DAG-NEXT: v_mov_b32_e32 v11, s27
+; GFX942DAG-NEXT: v_mov_b32_e32 v12, s28
+; GFX942DAG-NEXT: v_mov_b32_e32 v13, s29
+; GFX942DAG-NEXT: v_mov_b32_e32 v14, s30
+; GFX942DAG-NEXT: v_mov_b32_e32 v15, s31
+; GFX942DAG-NEXT: v_mov_b32_e32 v16, s0
+; GFX942DAG-NEXT: v_mov_b32_e32 v17, s1
+; GFX942DAG-NEXT: v_mov_b32_e32 v18, s2
+; GFX942DAG-NEXT: v_mov_b32_e32 v19, s3
+; GFX942DAG-NEXT: v_mov_b32_e32 v20, s4
+; GFX942DAG-NEXT: v_mov_b32_e32 v21, s5
+; GFX942DAG-NEXT: v_mov_b32_e32 v22, s6
+; GFX942DAG-NEXT: v_mov_b32_e32 v23, s7
+; GFX942DAG-NEXT: v_mov_b32_e32 v24, s8
+; GFX942DAG-NEXT: v_mov_b32_e32 v25, s9
+; GFX942DAG-NEXT: v_mov_b32_e32 v26, s10
+; GFX942DAG-NEXT: v_mov_b32_e32 v27, s11
+; GFX942DAG-NEXT: v_mov_b32_e32 v28, s12
+; GFX942DAG-NEXT: v_mov_b32_e32 v29, s13
+; GFX942DAG-NEXT: v_mov_b32_e32 v30, s14
+; GFX942DAG-NEXT: v_mov_b32_e32 v31, s15
+; GFX942DAG-NEXT: s_nop 1
+; GFX942DAG-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v33, v34, v[0:31]
+; GFX942DAG-NEXT: s_nop 15
+; GFX942DAG-NEXT: s_nop 1
+; GFX942DAG-NEXT: global_store_dwordx4 v32, v[24:27], s[34:35] offset:96
+; GFX942DAG-NEXT: global_store_dwordx4 v32, v[28:31], s[34:35] offset:112
+; GFX942DAG-NEXT: global_store_dwordx4 v32, v[16:19], s[34:35] offset:64
+; GFX942DAG-NEXT: global_store_dwordx4 v32, v[20:23], s[34:35] offset:80
+; GFX942DAG-NEXT: global_store_dwordx4 v32, v[8:11], s[34:35] offset:32
+; GFX942DAG-NEXT: global_store_dwordx4 v32, v[12:15], s[34:35] offset:48
+; GFX942DAG-NEXT: global_store_dwordx4 v32, v[0:3], s[34:35]
+; GFX942DAG-NEXT: global_store_dwordx4 v32, v[4:7], s[34:35] offset:16
+; GFX942DAG-NEXT: s_endpgm
+;
+; GFX942GSEL-LABEL: test_mfma_f32_32x32x1f32_vgpr:
+; GFX942GSEL: ; %bb.0: ; %bb
+; GFX942GSEL-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX942GSEL-NEXT: v_mov_b32_e32 v32, 1.0
+; GFX942GSEL-NEXT: v_mov_b32_e32 v33, 2.0
+; GFX942GSEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942GSEL-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x0
+; GFX942GSEL-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x40
+; GFX942GSEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942GSEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942GSEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942GSEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942GSEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942GSEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942GSEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942GSEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942GSEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942GSEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17]
+; GFX942GSEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19]
+; GFX942GSEL-NEXT: v_mov_b64_e32 v[20:21], s[20:21]
+; GFX942GSEL-NEXT: v_mov_b64_e32 v[22:23], s[22:23]
+; GFX942GSEL-NEXT: v_mov_b64_e32 v[24:25], s[24:25]
+; GFX942GSEL-NEXT: v_mov_b64_e32 v[26:27], s[26:27]
+; GFX942GSEL-NEXT: v_mov_b64_e32 v[28:29], s[28:29]
+; GFX942GSEL-NEXT: v_mov_b64_e32 v[30:31], s[30:31]
+; GFX942GSEL-NEXT: s_nop 1
+; GFX942GSEL-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
+; GFX942GSEL-NEXT: v_mov_b32_e32 v32, 0
+; GFX942GSEL-NEXT: s_nop 15
+; GFX942GSEL-NEXT: s_nop 0
+; GFX942GSEL-NEXT: global_store_dwordx4 v32, v[0:3], s[34:35]
+; GFX942GSEL-NEXT: global_store_dwordx4 v32, v[4:7], s[34:35] offset:16
+; GFX942GSEL-NEXT: global_store_dwordx4 v32, v[8:11], s[34:35] offset:32
+; GFX942GSEL-NEXT: global_store_dwordx4 v32, v[12:15], s[34:35] offset:48
+; GFX942GSEL-NEXT: global_store_dwordx4 v32, v[16:19], s[34:35] offset:64
+; GFX942GSEL-NEXT: global_store_dwordx4 v32, v[20:23], s[34:35] offset:80
+; GFX942GSEL-NEXT: global_store_dwordx4 v32, v[24:27], s[34:35] offset:96
+; GFX942GSEL-NEXT: global_store_dwordx4 v32, v[28:31], s[34:35] offset:112
+; GFX942GSEL-NEXT: s_endpgm
bb:
%in.1 = load <32 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0)
@@ -228,62 +420,286 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_agpr(ptr addrspace(1) %arg)
; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31]
; GFX908-NEXT: s_nop 15
; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a31
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a30
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a29
; GFX908-NEXT: v_accvgpr_read_b32 v0, a28
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a31
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:112
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a19
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a18
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a17
; GFX908-NEXT: v_accvgpr_read_b32 v0, a16
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a19
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a23
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a22
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a21
; GFX908-NEXT: v_accvgpr_read_b32 v0, a20
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a23
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a11
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a10
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a9
; GFX908-NEXT: v_accvgpr_read_b32 v0, a8
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a11
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a15
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a13
; GFX908-NEXT: v_accvgpr_read_b32 v0, a12
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a15
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35]
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16
; GFX908-NEXT: s_endpgm
+;
+; GFX90ADAG-LABEL: test_mfma_f32_32x32x1f32_agpr:
+; GFX90ADAG: ; %bb.0: ; %bb
+; GFX90ADAG-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX90ADAG-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX90ADAG-NEXT: v_mov_b32_e32 v2, 2.0
+; GFX90ADAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX90ADAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90ADAG-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0
+; GFX90ADAG-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40
+; GFX90ADAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90ADAG-NEXT: v_accvgpr_write_b32 a0, s16
+; GFX90ADAG-NEXT: v_accvgpr_write_b32 a1, s17
+; GFX90ADAG-NEXT: v_accvgpr_write_b32 a2, s18
+; GFX90ADAG-NEXT: v_accvgpr_write_b32 a3, s19
+; GFX90ADAG-NEXT: v_accvgpr_write_b32 a4, s20
+; GFX90ADAG-NEXT: v_accvgpr_write_b32 a5, s21
+; GFX90ADAG-NEXT: v_accvgpr_write_b32 a6, s22
+; GFX90ADAG-NEXT: v_accvgpr_write_b32 a7, s23
+; GFX90ADAG-NEXT: v_accvgpr_write_b32 a8, s24
+; GFX90ADAG-NEXT: v_accvgpr_write_b32 a9, s25
+; GFX90ADAG-NEXT: v_accvgpr_write_b32 a10, s26
+; GFX90ADAG-NEXT: v_accvgpr_write_b32 a11, s27
+; GFX90ADAG-NEXT: v_accvgpr_write_b32 a12, s28
+; GFX90ADAG-NEXT: v_accvgpr_write_b32 a13, s29
+; GFX90ADAG-NEXT: v_accvgpr_write_b32 a14, s30
+; GFX90ADAG-NEXT: v_accvgpr_write_b32 a15, s31
+; GFX90ADAG-NEXT: v_accvgpr_write_b32 a16, s0
+; GFX90ADAG-NEXT: v_accvgpr_write_b32 a17, s1
+; GFX90ADAG-NEXT: v_accvgpr_write_b32 a18, s2
+; GFX90ADAG-NEXT: v_accvgpr_write_b32 a19, s3
+; GFX90ADAG-NEXT: v_accvgpr_write_b32 a20, s4
+; GFX90ADAG-NEXT: v_accvgpr_write_b32 a21, s5
+; GFX90ADAG-NEXT: v_accvgpr_write_b32 a22, s6
+; GFX90ADAG-NEXT: v_accvgpr_write_b32 a23, s7
+; GFX90ADAG-NEXT: v_accvgpr_write_b32 a24, s8
+; GFX90ADAG-NEXT: v_accvgpr_write_b32 a25, s9
+; GFX90ADAG-NEXT: v_accvgpr_write_b32 a26, s10
+; GFX90ADAG-NEXT: v_accvgpr_write_b32 a27, s11
+; GFX90ADAG-NEXT: v_accvgpr_write_b32 a28, s12
+; GFX90ADAG-NEXT: v_accvgpr_write_b32 a29, s13
+; GFX90ADAG-NEXT: v_accvgpr_write_b32 a30, s14
+; GFX90ADAG-NEXT: v_accvgpr_write_b32 a31, s15
+; GFX90ADAG-NEXT: s_nop 1
+; GFX90ADAG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
+; GFX90ADAG-NEXT: s_nop 15
+; GFX90ADAG-NEXT: s_nop 2
+; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96
+; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112
+; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[16:19], s[34:35] offset:64
+; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[20:23], s[34:35] offset:80
+; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[8:11], s[34:35] offset:32
+; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[12:15], s[34:35] offset:48
+; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35]
+; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16
+; GFX90ADAG-NEXT: s_endpgm
+;
+; GFX90AGSEL-LABEL: test_mfma_f32_32x32x1f32_agpr:
+; GFX90AGSEL: ; %bb.0: ; %bb
+; GFX90AGSEL-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX90AGSEL-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX90AGSEL-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX90AGSEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90AGSEL-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x0
+; GFX90AGSEL-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x40
+; GFX90AGSEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a16, s16
+; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a4, s4
+; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a5, s5
+; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a6, s6
+; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a7, s7
+; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a8, s8
+; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a9, s9
+; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a10, s10
+; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a11, s11
+; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a12, s12
+; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a13, s13
+; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a14, s14
+; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a17, s17
+; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a18, s18
+; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a19, s19
+; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a20, s20
+; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a21, s21
+; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a22, s22
+; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a23, s23
+; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a24, s24
+; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a25, s25
+; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a26, s26
+; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a27, s27
+; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a28, s28
+; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a29, s29
+; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a30, s30
+; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a31, s31
+; GFX90AGSEL-NEXT: s_nop 1
+; GFX90AGSEL-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; GFX90AGSEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX90AGSEL-NEXT: s_nop 15
+; GFX90AGSEL-NEXT: s_nop 1
+; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35]
+; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16
+; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[8:11], s[34:35] offset:32
+; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[12:15], s[34:35] offset:48
+; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[16:19], s[34:35] offset:64
+; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[20:23], s[34:35] offset:80
+; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96
+; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112
+; GFX90AGSEL-NEXT: s_endpgm
+;
+; GFX942DAG-LABEL: test_mfma_f32_32x32x1f32_agpr:
+; GFX942DAG: ; %bb.0: ; %bb
+; GFX942DAG-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX942DAG-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX942DAG-NEXT: v_mov_b32_e32 v2, 2.0
+; GFX942DAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942DAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942DAG-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0
+; GFX942DAG-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40
+; GFX942DAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942DAG-NEXT: v_accvgpr_write_b32 a0, s16
+; GFX942DAG-NEXT: v_accvgpr_write_b32 a1, s17
+; GFX942DAG-NEXT: v_accvgpr_write_b32 a2, s18
+; GFX942DAG-NEXT: v_accvgpr_write_b32 a3, s19
+; GFX942DAG-NEXT: v_accvgpr_write_b32 a4, s20
+; GFX942DAG-NEXT: v_accvgpr_write_b32 a5, s21
+; GFX942DAG-NEXT: v_accvgpr_write_b32 a6, s22
+; GFX942DAG-NEXT: v_accvgpr_write_b32 a7, s23
+; GFX942DAG-NEXT: v_accvgpr_write_b32 a8, s24
+; GFX942DAG-NEXT: v_accvgpr_write_b32 a9, s25
+; GFX942DAG-NEXT: v_accvgpr_write_b32 a10, s26
+; GFX942DAG-NEXT: v_accvgpr_write_b32 a11, s27
+; GFX942DAG-NEXT: v_accvgpr_write_b32 a12, s28
+; GFX942DAG-NEXT: v_accvgpr_write_b32 a13, s29
+; GFX942DAG-NEXT: v_accvgpr_write_b32 a14, s30
+; GFX942DAG-NEXT: v_accvgpr_write_b32 a15, s31
+; GFX942DAG-NEXT: v_accvgpr_write_b32 a16, s0
+; GFX942DAG-NEXT: v_accvgpr_write_b32 a17, s1
+; GFX942DAG-NEXT: v_accvgpr_write_b32 a18, s2
+; GFX942DAG-NEXT: v_accvgpr_write_b32 a19, s3
+; GFX942DAG-NEXT: v_accvgpr_write_b32 a20, s4
+; GFX942DAG-NEXT: v_accvgpr_write_b32 a21, s5
+; GFX942DAG-NEXT: v_accvgpr_write_b32 a22, s6
+; GFX942DAG-NEXT: v_accvgpr_write_b32 a23, s7
+; GFX942DAG-NEXT: v_accvgpr_write_b32 a24, s8
+; GFX942DAG-NEXT: v_accvgpr_write_b32 a25, s9
+; GFX942DAG-NEXT: v_accvgpr_write_b32 a26, s10
+; GFX942DAG-NEXT: v_accvgpr_write_b32 a27, s11
+; GFX942DAG-NEXT: v_accvgpr_write_b32 a28, s12
+; GFX942DAG-NEXT: v_accvgpr_write_b32 a29, s13
+; GFX942DAG-NEXT: v_accvgpr_write_b32 a30, s14
+; GFX942DAG-NEXT: v_accvgpr_write_b32 a31, s15
+; GFX942DAG-NEXT: s_nop 1
+; GFX942DAG-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v2, a[0:31]
+; GFX942DAG-NEXT: s_nop 15
+; GFX942DAG-NEXT: s_nop 1
+; GFX942DAG-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96
+; GFX942DAG-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112
+; GFX942DAG-NEXT: global_store_dwordx4 v0, a[16:19], s[34:35] offset:64
+; GFX942DAG-NEXT: global_store_dwordx4 v0, a[20:23], s[34:35] offset:80
+; GFX942DAG-NEXT: global_store_dwordx4 v0, a[8:11], s[34:35] offset:32
+; GFX942DAG-NEXT: global_store_dwordx4 v0, a[12:15], s[34:35] offset:48
+; GFX942DAG-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35]
+; GFX942DAG-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16
+; GFX942DAG-NEXT: s_endpgm
+;
+; GFX942GSEL-LABEL: test_mfma_f32_32x32x1f32_agpr:
+; GFX942GSEL: ; %bb.0: ; %bb
+; GFX942GSEL-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX942GSEL-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX942GSEL-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX942GSEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942GSEL-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x0
+; GFX942GSEL-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x40
+; GFX942GSEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942GSEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942GSEL-NEXT: v_accvgpr_write_b32 a16, s16
+; GFX942GSEL-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942GSEL-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942GSEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942GSEL-NEXT: v_accvgpr_write_b32 a4, s4
+; GFX942GSEL-NEXT: v_accvgpr_write_b32 a5, s5
+; GFX942GSEL-NEXT: v_accvgpr_write_b32 a6, s6
+; GFX942GSEL-NEXT: v_accvgpr_write_b32 a7, s7
+; GFX942GSEL-NEXT: v_accvgpr_write_b32 a8, s8
+; GFX942GSEL-NEXT: v_accvgpr_write_b32 a9, s9
+; GFX942GSEL-NEXT: v_accvgpr_write_b32 a10, s10
+; GFX942GSEL-NEXT: v_accvgpr_write_b32 a11, s11
+; GFX942GSEL-NEXT: v_accvgpr_write_b32 a12, s12
+; GFX942GSEL-NEXT: v_accvgpr_write_b32 a13, s13
+; GFX942GSEL-NEXT: v_accvgpr_write_b32 a14, s14
+; GFX942GSEL-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX942GSEL-NEXT: v_accvgpr_write_b32 a17, s17
+; GFX942GSEL-NEXT: v_accvgpr_write_b32 a18, s18
+; GFX942GSEL-NEXT: v_accvgpr_write_b32 a19, s19
+; GFX942GSEL-NEXT: v_accvgpr_write_b32 a20, s20
+; GFX942GSEL-NEXT: v_accvgpr_write_b32 a21, s21
+; GFX942GSEL-NEXT: v_accvgpr_write_b32 a22, s22
+; GFX942GSEL-NEXT: v_accvgpr_write_b32 a23, s23
+; GFX942GSEL-NEXT: v_accvgpr_write_b32 a24, s24
+; GFX942GSEL-NEXT: v_accvgpr_write_b32 a25, s25
+; GFX942GSEL-NEXT: v_accvgpr_write_b32 a26, s26
+; GFX942GSEL-NEXT: v_accvgpr_write_b32 a27, s27
+; GFX942GSEL-NEXT: v_accvgpr_write_b32 a28, s28
+; GFX942GSEL-NEXT: v_accvgpr_write_b32 a29, s29
+; GFX942GSEL-NEXT: v_accvgpr_write_b32 a30, s30
+; GFX942GSEL-NEXT: v_accvgpr_write_b32 a31, s31
+; GFX942GSEL-NEXT: s_nop 1
+; GFX942GSEL-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]
+; GFX942GSEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942GSEL-NEXT: s_nop 15
+; GFX942GSEL-NEXT: s_nop 0
+; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35]
+; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16
+; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[8:11], s[34:35] offset:32
+; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[12:15], s[34:35] offset:48
+; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[16:19], s[34:35] offset:64
+; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[20:23], s[34:35] offset:80
+; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96
+; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112
+; GFX942GSEL-NEXT: s_endpgm
bb:
%in.1 = load <32 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0)
@@ -347,40 +763,40 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_virtual_agpr(ptr
; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
; GFX908-NEXT: s_nop 15
; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
-; GFX908-NEXT: v_accvgpr_read_b32 v7, a31
-; GFX908-NEXT: v_accvgpr_read_b32 v6, a30
-; GFX908-NEXT: v_accvgpr_read_b32 v5, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
; GFX908-NEXT: v_accvgpr_read_b32 v4, a28
-; GFX908-NEXT: v_accvgpr_read_b32 v11, a19
-; GFX908-NEXT: v_accvgpr_read_b32 v10, a18
-; GFX908-NEXT: v_accvgpr_read_b32 v9, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v7, a31
; GFX908-NEXT: v_accvgpr_read_b32 v8, a16
-; GFX908-NEXT: v_accvgpr_read_b32 v15, a23
-; GFX908-NEXT: v_accvgpr_read_b32 v14, a22
-; GFX908-NEXT: v_accvgpr_read_b32 v13, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v9, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v10, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v11, a19
; GFX908-NEXT: v_accvgpr_read_b32 v12, a20
-; GFX908-NEXT: v_accvgpr_read_b32 v19, a11
-; GFX908-NEXT: v_accvgpr_read_b32 v18, a10
-; GFX908-NEXT: v_accvgpr_read_b32 v17, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v13, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v14, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v15, a23
; GFX908-NEXT: v_accvgpr_read_b32 v16, a8
-; GFX908-NEXT: v_accvgpr_read_b32 v23, a15
-; GFX908-NEXT: v_accvgpr_read_b32 v22, a14
-; GFX908-NEXT: v_accvgpr_read_b32 v21, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v17, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v18, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v19, a11
; GFX908-NEXT: v_accvgpr_read_b32 v20, a12
-; GFX908-NEXT: v_accvgpr_read_b32 v27, a3
-; GFX908-NEXT: v_accvgpr_read_b32 v26, a2
-; GFX908-NEXT: v_accvgpr_read_b32 v25, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v21, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v22, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v23, a15
; GFX908-NEXT: v_accvgpr_read_b32 v24, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v25, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v26, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v27, a3
; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:96
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:112
; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:64
; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:80
@@ -389,6 +805,134 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_virtual_agpr(ptr
; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1]
; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:16
; GFX908-NEXT: s_endpgm
+;
+; GFX90ADAG-LABEL: test_mfma_f32_32x32x1f32_inline_asm_virtual_agpr:
+; GFX90ADAG: ; %bb.0: ; %bb
+; GFX90ADAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX90ADAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX90ADAG-NEXT: ;;#ASMSTART
+; GFX90ADAG-NEXT: ; def a0
+; GFX90ADAG-NEXT: ;;#ASMEND
+; GFX90ADAG-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX90ADAG-NEXT: v_mov_b32_e32 v2, 2.0
+; GFX90ADAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90ADAG-NEXT: global_load_dwordx4 a[28:31], v0, s[0:1] offset:112
+; GFX90ADAG-NEXT: global_load_dwordx4 a[24:27], v0, s[0:1] offset:96
+; GFX90ADAG-NEXT: global_load_dwordx4 a[20:23], v0, s[0:1] offset:80
+; GFX90ADAG-NEXT: global_load_dwordx4 a[16:19], v0, s[0:1] offset:64
+; GFX90ADAG-NEXT: global_load_dwordx4 a[12:15], v0, s[0:1] offset:48
+; GFX90ADAG-NEXT: global_load_dwordx4 a[8:11], v0, s[0:1] offset:32
+; GFX90ADAG-NEXT: global_load_dwordx4 a[4:7], v0, s[0:1] offset:16
+; GFX90ADAG-NEXT: global_load_dwordx4 a[0:3], v0, s[0:1]
+; GFX90ADAG-NEXT: s_waitcnt vmcnt(0)
+; GFX90ADAG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
+; GFX90ADAG-NEXT: s_nop 15
+; GFX90ADAG-NEXT: s_nop 2
+; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
+; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
+; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
+; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
+; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX90ADAG-NEXT: s_endpgm
+;
+; GFX90AGSEL-LABEL: test_mfma_f32_32x32x1f32_inline_asm_virtual_agpr:
+; GFX90AGSEL: ; %bb.0: ; %bb
+; GFX90AGSEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX90AGSEL-NEXT: ;;#ASMSTART
+; GFX90AGSEL-NEXT: ; def a0
+; GFX90AGSEL-NEXT: ;;#ASMEND
+; GFX90AGSEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX90AGSEL-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX90AGSEL-NEXT: v_mov_b32_e32 v2, 2.0
+; GFX90AGSEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90AGSEL-NEXT: global_load_dwordx4 a[0:3], v0, s[0:1]
+; GFX90AGSEL-NEXT: global_load_dwordx4 a[4:7], v0, s[0:1] offset:16
+; GFX90AGSEL-NEXT: global_load_dwordx4 a[8:11], v0, s[0:1] offset:32
+; GFX90AGSEL-NEXT: global_load_dwordx4 a[12:15], v0, s[0:1] offset:48
+; GFX90AGSEL-NEXT: global_load_dwordx4 a[16:19], v0, s[0:1] offset:64
+; GFX90AGSEL-NEXT: global_load_dwordx4 a[20:23], v0, s[0:1] offset:80
+; GFX90AGSEL-NEXT: global_load_dwordx4 a[24:27], v0, s[0:1] offset:96
+; GFX90AGSEL-NEXT: global_load_dwordx4 a[28:31], v0, s[0:1] offset:112
+; GFX90AGSEL-NEXT: s_waitcnt vmcnt(0)
+; GFX90AGSEL-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
+; GFX90AGSEL-NEXT: s_nop 15
+; GFX90AGSEL-NEXT: s_nop 2
+; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
+; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
+; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
+; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
+; GFX90AGSEL-NEXT: s_endpgm
+;
+; GFX942DAG-LABEL: test_mfma_f32_32x32x1f32_inline_asm_virtual_agpr:
+; GFX942DAG: ; %bb.0: ; %bb
+; GFX942DAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942DAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942DAG-NEXT: ;;#ASMSTART
+; GFX942DAG-NEXT: ; def a0
+; GFX942DAG-NEXT: ;;#ASMEND
+; GFX942DAG-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX942DAG-NEXT: v_mov_b32_e32 v2, 2.0
+; GFX942DAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942DAG-NEXT: global_load_dwordx4 a[28:31], v0, s[0:1] offset:112
+; GFX942DAG-NEXT: global_load_dwordx4 a[24:27], v0, s[0:1] offset:96
+; GFX942DAG-NEXT: global_load_dwordx4 a[20:23], v0, s[0:1] offset:80
+; GFX942DAG-NEXT: global_load_dwordx4 a[16:19], v0, s[0:1] offset:64
+; GFX942DAG-NEXT: global_load_dwordx4 a[12:15], v0, s[0:1] offset:48
+; GFX942DAG-NEXT: global_load_dwordx4 a[8:11], v0, s[0:1] offset:32
+; GFX942DAG-NEXT: global_load_dwordx4 a[4:7], v0, s[0:1] offset:16
+; GFX942DAG-NEXT: global_load_dwordx4 a[0:3], v0, s[0:1]
+; GFX942DAG-NEXT: s_waitcnt vmcnt(0)
+; GFX942DAG-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v2, a[0:31]
+; GFX942DAG-NEXT: s_nop 15
+; GFX942DAG-NEXT: s_nop 1
+; GFX942DAG-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
+; GFX942DAG-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
+; GFX942DAG-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
+; GFX942DAG-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
+; GFX942DAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX942DAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX942DAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942DAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX942DAG-NEXT: s_endpgm
+;
+; GFX942GSEL-LABEL: test_mfma_f32_32x32x1f32_inline_asm_virtual_agpr:
+; GFX942GSEL: ; %bb.0: ; %bb
+; GFX942GSEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942GSEL-NEXT: ;;#ASMSTART
+; GFX942GSEL-NEXT: ; def a0
+; GFX942GSEL-NEXT: ;;#ASMEND
+; GFX942GSEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942GSEL-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX942GSEL-NEXT: v_mov_b32_e32 v2, 2.0
+; GFX942GSEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942GSEL-NEXT: global_load_dwordx4 a[0:3], v0, s[0:1]
+; GFX942GSEL-NEXT: global_load_dwordx4 a[4:7], v0, s[0:1] offset:16
+; GFX942GSEL-NEXT: global_load_dwordx4 a[8:11], v0, s[0:1] offset:32
+; GFX942GSEL-NEXT: global_load_dwordx4 a[12:15], v0, s[0:1] offset:48
+; GFX942GSEL-NEXT: global_load_dwordx4 a[16:19], v0, s[0:1] offset:64
+; GFX942GSEL-NEXT: global_load_dwordx4 a[20:23], v0, s[0:1] offset:80
+; GFX942GSEL-NEXT: global_load_dwordx4 a[24:27], v0, s[0:1] offset:96
+; GFX942GSEL-NEXT: global_load_dwordx4 a[28:31], v0, s[0:1] offset:112
+; GFX942GSEL-NEXT: s_waitcnt vmcnt(0)
+; GFX942GSEL-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v2, a[0:31]
+; GFX942GSEL-NEXT: s_nop 15
+; GFX942GSEL-NEXT: s_nop 1
+; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
+; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
+; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
+; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
+; GFX942GSEL-NEXT: s_endpgm
bb:
%acc = call i32 asm sideeffect "; def $0", "={a0}"()
%in.1 = load <32 x float>, ptr addrspace(1) %arg
@@ -453,40 +997,40 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_phys_agpr(ptr add
; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
; GFX908-NEXT: s_nop 15
; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
-; GFX908-NEXT: v_accvgpr_read_b32 v7, a31
-; GFX908-NEXT: v_accvgpr_read_b32 v6, a30
-; GFX908-NEXT: v_accvgpr_read_b32 v5, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
; GFX908-NEXT: v_accvgpr_read_b32 v4, a28
-; GFX908-NEXT: v_accvgpr_read_b32 v11, a19
-; GFX908-NEXT: v_accvgpr_read_b32 v10, a18
-; GFX908-NEXT: v_accvgpr_read_b32 v9, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v7, a31
; GFX908-NEXT: v_accvgpr_read_b32 v8, a16
-; GFX908-NEXT: v_accvgpr_read_b32 v15, a23
-; GFX908-NEXT: v_accvgpr_read_b32 v14, a22
-; GFX908-NEXT: v_accvgpr_read_b32 v13, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v9, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v10, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v11, a19
; GFX908-NEXT: v_accvgpr_read_b32 v12, a20
-; GFX908-NEXT: v_accvgpr_read_b32 v19, a11
-; GFX908-NEXT: v_accvgpr_read_b32 v18, a10
-; GFX908-NEXT: v_accvgpr_read_b32 v17, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v13, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v14, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v15, a23
; GFX908-NEXT: v_accvgpr_read_b32 v16, a8
-; GFX908-NEXT: v_accvgpr_read_b32 v23, a15
-; GFX908-NEXT: v_accvgpr_read_b32 v22, a14
-; GFX908-NEXT: v_accvgpr_read_b32 v21, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v17, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v18, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v19, a11
; GFX908-NEXT: v_accvgpr_read_b32 v20, a12
-; GFX908-NEXT: v_accvgpr_read_b32 v27, a3
-; GFX908-NEXT: v_accvgpr_read_b32 v26, a2
-; GFX908-NEXT: v_accvgpr_read_b32 v25, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v21, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v22, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v23, a15
; GFX908-NEXT: v_accvgpr_read_b32 v24, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v25, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v26, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v27, a3
; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:96
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:112
; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:64
; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:80
@@ -495,6 +1039,134 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_phys_agpr(ptr add
; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1]
; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:16
; GFX908-NEXT: s_endpgm
+;
+; GFX90ADAG-LABEL: test_mfma_f32_32x32x1f32_inline_asm_phys_agpr:
+; GFX90ADAG: ; %bb.0: ; %bb
+; GFX90ADAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX90ADAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX90ADAG-NEXT: ;;#ASMSTART
+; GFX90ADAG-NEXT: ; use a[100:131]
+; GFX90ADAG-NEXT: ;;#ASMEND
+; GFX90ADAG-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX90ADAG-NEXT: v_mov_b32_e32 v2, 2.0
+; GFX90ADAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90ADAG-NEXT: global_load_dwordx4 a[28:31], v0, s[0:1] offset:112
+; GFX90ADAG-NEXT: global_load_dwordx4 a[24:27], v0, s[0:1] offset:96
+; GFX90ADAG-NEXT: global_load_dwordx4 a[20:23], v0, s[0:1] offset:80
+; GFX90ADAG-NEXT: global_load_dwordx4 a[16:19], v0, s[0:1] offset:64
+; GFX90ADAG-NEXT: global_load_dwordx4 a[12:15], v0, s[0:1] offset:48
+; GFX90ADAG-NEXT: global_load_dwordx4 a[8:11], v0, s[0:1] offset:32
+; GFX90ADAG-NEXT: global_load_dwordx4 a[4:7], v0, s[0:1] offset:16
+; GFX90ADAG-NEXT: global_load_dwordx4 a[0:3], v0, s[0:1]
+; GFX90ADAG-NEXT: s_waitcnt vmcnt(0)
+; GFX90ADAG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
+; GFX90ADAG-NEXT: s_nop 15
+; GFX90ADAG-NEXT: s_nop 2
+; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
+; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
+; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
+; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
+; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX90ADAG-NEXT: s_endpgm
+;
+; GFX90AGSEL-LABEL: test_mfma_f32_32x32x1f32_inline_asm_phys_agpr:
+; GFX90AGSEL: ; %bb.0: ; %bb
+; GFX90AGSEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX90AGSEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX90AGSEL-NEXT: ;;#ASMSTART
+; GFX90AGSEL-NEXT: ; use a[100:131]
+; GFX90AGSEL-NEXT: ;;#ASMEND
+; GFX90AGSEL-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX90AGSEL-NEXT: v_mov_b32_e32 v2, 2.0
+; GFX90AGSEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90AGSEL-NEXT: global_load_dwordx4 a[0:3], v0, s[0:1]
+; GFX90AGSEL-NEXT: global_load_dwordx4 a[4:7], v0, s[0:1] offset:16
+; GFX90AGSEL-NEXT: global_load_dwordx4 a[8:11], v0, s[0:1] offset:32
+; GFX90AGSEL-NEXT: global_load_dwordx4 a[12:15], v0, s[0:1] offset:48
+; GFX90AGSEL-NEXT: global_load_dwordx4 a[16:19], v0, s[0:1] offset:64
+; GFX90AGSEL-NEXT: global_load_dwordx4 a[20:23], v0, s[0:1] offset:80
+; GFX90AGSEL-NEXT: global_load_dwordx4 a[24:27], v0, s[0:1] offset:96
+; GFX90AGSEL-NEXT: global_load_dwordx4 a[28:31], v0, s[0:1] offset:112
+; GFX90AGSEL-NEXT: s_waitcnt vmcnt(0)
+; GFX90AGSEL-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
+; GFX90AGSEL-NEXT: s_nop 15
+; GFX90AGSEL-NEXT: s_nop 2
+; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
+; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
+; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
+; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
+; GFX90AGSEL-NEXT: s_endpgm
+;
+; GFX942DAG-LABEL: test_mfma_f32_32x32x1f32_inline_asm_phys_agpr:
+; GFX942DAG: ; %bb.0: ; %bb
+; GFX942DAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942DAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942DAG-NEXT: ;;#ASMSTART
+; GFX942DAG-NEXT: ; use a[100:131]
+; GFX942DAG-NEXT: ;;#ASMEND
+; GFX942DAG-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX942DAG-NEXT: v_mov_b32_e32 v2, 2.0
+; GFX942DAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942DAG-NEXT: global_load_dwordx4 a[28:31], v0, s[0:1] offset:112
+; GFX942DAG-NEXT: global_load_dwordx4 a[24:27], v0, s[0:1] offset:96
+; GFX942DAG-NEXT: global_load_dwordx4 a[20:23], v0, s[0:1] offset:80
+; GFX942DAG-NEXT: global_load_dwordx4 a[16:19], v0, s[0:1] offset:64
+; GFX942DAG-NEXT: global_load_dwordx4 a[12:15], v0, s[0:1] offset:48
+; GFX942DAG-NEXT: global_load_dwordx4 a[8:11], v0, s[0:1] offset:32
+; GFX942DAG-NEXT: global_load_dwordx4 a[4:7], v0, s[0:1] offset:16
+; GFX942DAG-NEXT: global_load_dwordx4 a[0:3], v0, s[0:1]
+; GFX942DAG-NEXT: s_waitcnt vmcnt(0)
+; GFX942DAG-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v2, a[0:31]
+; GFX942DAG-NEXT: s_nop 15
+; GFX942DAG-NEXT: s_nop 1
+; GFX942DAG-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
+; GFX942DAG-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
+; GFX942DAG-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
+; GFX942DAG-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
+; GFX942DAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX942DAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX942DAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942DAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX942DAG-NEXT: s_endpgm
+;
+; GFX942GSEL-LABEL: test_mfma_f32_32x32x1f32_inline_asm_phys_agpr:
+; GFX942GSEL: ; %bb.0: ; %bb
+; GFX942GSEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942GSEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942GSEL-NEXT: ;;#ASMSTART
+; GFX942GSEL-NEXT: ; use a[100:131]
+; GFX942GSEL-NEXT: ;;#ASMEND
+; GFX942GSEL-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX942GSEL-NEXT: v_mov_b32_e32 v2, 2.0
+; GFX942GSEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942GSEL-NEXT: global_load_dwordx4 a[0:3], v0, s[0:1]
+; GFX942GSEL-NEXT: global_load_dwordx4 a[4:7], v0, s[0:1] offset:16
+; GFX942GSEL-NEXT: global_load_dwordx4 a[8:11], v0, s[0:1] offset:32
+; GFX942GSEL-NEXT: global_load_dwordx4 a[12:15], v0, s[0:1] offset:48
+; GFX942GSEL-NEXT: global_load_dwordx4 a[16:19], v0, s[0:1] offset:64
+; GFX942GSEL-NEXT: global_load_dwordx4 a[20:23], v0, s[0:1] offset:80
+; GFX942GSEL-NEXT: global_load_dwordx4 a[24:27], v0, s[0:1] offset:96
+; GFX942GSEL-NEXT: global_load_dwordx4 a[28:31], v0, s[0:1] offset:112
+; GFX942GSEL-NEXT: s_waitcnt vmcnt(0)
+; GFX942GSEL-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v2, a[0:31]
+; GFX942GSEL-NEXT: s_nop 15
+; GFX942GSEL-NEXT: s_nop 1
+; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
+; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
+; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
+; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
+; GFX942GSEL-NEXT: s_endpgm
bb:
call void asm sideeffect "; use $0", "{a[100:131]}"(<32 x float> poison)
%in.1 = load <32 x float>, ptr addrspace(1) %arg
@@ -559,40 +1231,40 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_no_agprs(ptr addr
; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
; GFX908-NEXT: s_nop 15
; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
-; GFX908-NEXT: v_accvgpr_read_b32 v7, a31
-; GFX908-NEXT: v_accvgpr_read_b32 v6, a30
-; GFX908-NEXT: v_accvgpr_read_b32 v5, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
; GFX908-NEXT: v_accvgpr_read_b32 v4, a28
-; GFX908-NEXT: v_accvgpr_read_b32 v11, a19
-; GFX908-NEXT: v_accvgpr_read_b32 v10, a18
-; GFX908-NEXT: v_accvgpr_read_b32 v9, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v7, a31
; GFX908-NEXT: v_accvgpr_read_b32 v8, a16
-; GFX908-NEXT: v_accvgpr_read_b32 v15, a23
-; GFX908-NEXT: v_accvgpr_read_b32 v14, a22
-; GFX908-NEXT: v_accvgpr_read_b32 v13, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v9, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v10, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v11, a19
; GFX908-NEXT: v_accvgpr_read_b32 v12, a20
-; GFX908-NEXT: v_accvgpr_read_b32 v19, a11
-; GFX908-NEXT: v_accvgpr_read_b32 v18, a10
-; GFX908-NEXT: v_accvgpr_read_b32 v17, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v13, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v14, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v15, a23
; GFX908-NEXT: v_accvgpr_read_b32 v16, a8
-; GFX908-NEXT: v_accvgpr_read_b32 v23, a15
-; GFX908-NEXT: v_accvgpr_read_b32 v22, a14
-; GFX908-NEXT: v_accvgpr_read_b32 v21, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v17, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v18, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v19, a11
; GFX908-NEXT: v_accvgpr_read_b32 v20, a12
-; GFX908-NEXT: v_accvgpr_read_b32 v27, a3
-; GFX908-NEXT: v_accvgpr_read_b32 v26, a2
-; GFX908-NEXT: v_accvgpr_read_b32 v25, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v21, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v22, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v23, a15
; GFX908-NEXT: v_accvgpr_read_b32 v24, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v25, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v26, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v27, a3
; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:96
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:112
; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:64
; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:80
@@ -601,6 +1273,134 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_no_agprs(ptr addr
; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1]
; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:16
; GFX908-NEXT: s_endpgm
+;
+; GFX90ADAG-LABEL: test_mfma_f32_32x32x1f32_inline_asm_no_agprs:
+; GFX90ADAG: ; %bb.0: ; %bb
+; GFX90ADAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX90ADAG-NEXT: v_mov_b32_e32 v32, 0
+; GFX90ADAG-NEXT: ;;#ASMSTART
+; GFX90ADAG-NEXT: ; def v0
+; GFX90ADAG-NEXT: ;;#ASMEND
+; GFX90ADAG-NEXT: v_mov_b32_e32 v33, 1.0
+; GFX90ADAG-NEXT: v_mov_b32_e32 v34, 2.0
+; GFX90ADAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90ADAG-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
+; GFX90ADAG-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
+; GFX90ADAG-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
+; GFX90ADAG-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64
+; GFX90ADAG-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
+; GFX90ADAG-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32
+; GFX90ADAG-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
+; GFX90ADAG-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1]
+; GFX90ADAG-NEXT: s_waitcnt vmcnt(0)
+; GFX90ADAG-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v33, v34, v[0:31]
+; GFX90ADAG-NEXT: s_nop 15
+; GFX90ADAG-NEXT: s_nop 2
+; GFX90ADAG-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; GFX90ADAG-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
+; GFX90ADAG-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; GFX90ADAG-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; GFX90ADAG-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; GFX90ADAG-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; GFX90ADAG-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
+; GFX90ADAG-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; GFX90ADAG-NEXT: s_endpgm
+;
+; GFX90AGSEL-LABEL: test_mfma_f32_32x32x1f32_inline_asm_no_agprs:
+; GFX90AGSEL: ; %bb.0: ; %bb
+; GFX90AGSEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX90AGSEL-NEXT: ;;#ASMSTART
+; GFX90AGSEL-NEXT: ; def v0
+; GFX90AGSEL-NEXT: ;;#ASMEND
+; GFX90AGSEL-NEXT: v_mov_b32_e32 v32, 0
+; GFX90AGSEL-NEXT: v_mov_b32_e32 v33, 1.0
+; GFX90AGSEL-NEXT: v_mov_b32_e32 v34, 2.0
+; GFX90AGSEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90AGSEL-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1]
+; GFX90AGSEL-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
+; GFX90AGSEL-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32
+; GFX90AGSEL-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
+; GFX90AGSEL-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64
+; GFX90AGSEL-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
+; GFX90AGSEL-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
+; GFX90AGSEL-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
+; GFX90AGSEL-NEXT: s_waitcnt vmcnt(0)
+; GFX90AGSEL-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v33, v34, v[0:31]
+; GFX90AGSEL-NEXT: s_nop 15
+; GFX90AGSEL-NEXT: s_nop 2
+; GFX90AGSEL-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
+; GFX90AGSEL-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; GFX90AGSEL-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; GFX90AGSEL-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; GFX90AGSEL-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; GFX90AGSEL-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; GFX90AGSEL-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; GFX90AGSEL-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
+; GFX90AGSEL-NEXT: s_endpgm
+;
+; GFX942DAG-LABEL: test_mfma_f32_32x32x1f32_inline_asm_no_agprs:
+; GFX942DAG: ; %bb.0: ; %bb
+; GFX942DAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942DAG-NEXT: v_mov_b32_e32 v32, 0
+; GFX942DAG-NEXT: ;;#ASMSTART
+; GFX942DAG-NEXT: ; def v0
+; GFX942DAG-NEXT: ;;#ASMEND
+; GFX942DAG-NEXT: v_mov_b32_e32 v33, 1.0
+; GFX942DAG-NEXT: v_mov_b32_e32 v34, 2.0
+; GFX942DAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942DAG-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
+; GFX942DAG-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
+; GFX942DAG-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
+; GFX942DAG-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64
+; GFX942DAG-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
+; GFX942DAG-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32
+; GFX942DAG-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
+; GFX942DAG-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1]
+; GFX942DAG-NEXT: s_waitcnt vmcnt(0)
+; GFX942DAG-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v33, v34, v[0:31]
+; GFX942DAG-NEXT: s_nop 15
+; GFX942DAG-NEXT: s_nop 1
+; GFX942DAG-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; GFX942DAG-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
+; GFX942DAG-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; GFX942DAG-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; GFX942DAG-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; GFX942DAG-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; GFX942DAG-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
+; GFX942DAG-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; GFX942DAG-NEXT: s_endpgm
+;
+; GFX942GSEL-LABEL: test_mfma_f32_32x32x1f32_inline_asm_no_agprs:
+; GFX942GSEL: ; %bb.0: ; %bb
+; GFX942GSEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942GSEL-NEXT: ;;#ASMSTART
+; GFX942GSEL-NEXT: ; def v0
+; GFX942GSEL-NEXT: ;;#ASMEND
+; GFX942GSEL-NEXT: v_mov_b32_e32 v32, 0
+; GFX942GSEL-NEXT: v_mov_b32_e32 v33, 1.0
+; GFX942GSEL-NEXT: v_mov_b32_e32 v34, 2.0
+; GFX942GSEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942GSEL-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1]
+; GFX942GSEL-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
+; GFX942GSEL-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32
+; GFX942GSEL-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
+; GFX942GSEL-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64
+; GFX942GSEL-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
+; GFX942GSEL-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
+; GFX942GSEL-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
+; GFX942GSEL-NEXT: s_waitcnt vmcnt(0)
+; GFX942GSEL-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v33, v34, v[0:31]
+; GFX942GSEL-NEXT: s_nop 15
+; GFX942GSEL-NEXT: s_nop 1
+; GFX942GSEL-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
+; GFX942GSEL-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; GFX942GSEL-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; GFX942GSEL-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; GFX942GSEL-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; GFX942GSEL-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; GFX942GSEL-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; GFX942GSEL-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
+; GFX942GSEL-NEXT: s_endpgm
bb:
%acc = call i32 asm sideeffect "; def $0", "={v0}"()
%in.1 = load <32 x float>, ptr addrspace(1) %arg
@@ -687,40 +1487,40 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_call(ptr addrspace(1) %arg)
; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
; GFX908-NEXT: s_nop 15
; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
-; GFX908-NEXT: v_accvgpr_read_b32 v7, a31
-; GFX908-NEXT: v_accvgpr_read_b32 v6, a30
-; GFX908-NEXT: v_accvgpr_read_b32 v5, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
; GFX908-NEXT: v_accvgpr_read_b32 v4, a28
-; GFX908-NEXT: v_accvgpr_read_b32 v11, a19
-; GFX908-NEXT: v_accvgpr_read_b32 v10, a18
-; GFX908-NEXT: v_accvgpr_read_b32 v9, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v7, a31
; GFX908-NEXT: v_accvgpr_read_b32 v8, a16
-; GFX908-NEXT: v_accvgpr_read_b32 v15, a23
-; GFX908-NEXT: v_accvgpr_read_b32 v14, a22
-; GFX908-NEXT: v_accvgpr_read_b32 v13, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v9, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v10, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v11, a19
; GFX908-NEXT: v_accvgpr_read_b32 v12, a20
-; GFX908-NEXT: v_accvgpr_read_b32 v19, a11
-; GFX908-NEXT: v_accvgpr_read_b32 v18, a10
-; GFX908-NEXT: v_accvgpr_read_b32 v17, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v13, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v14, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v15, a23
; GFX908-NEXT: v_accvgpr_read_b32 v16, a8
-; GFX908-NEXT: v_accvgpr_read_b32 v23, a15
-; GFX908-NEXT: v_accvgpr_read_b32 v22, a14
-; GFX908-NEXT: v_accvgpr_read_b32 v21, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v17, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v18, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v19, a11
; GFX908-NEXT: v_accvgpr_read_b32 v20, a12
-; GFX908-NEXT: v_accvgpr_read_b32 v27, a3
-; GFX908-NEXT: v_accvgpr_read_b32 v26, a2
-; GFX908-NEXT: v_accvgpr_read_b32 v25, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v21, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v22, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v23, a15
; GFX908-NEXT: v_accvgpr_read_b32 v24, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v25, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v26, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v27, a3
; GFX908-NEXT: global_store_dwordx4 v40, v[0:3], s[34:35] offset:96
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
; GFX908-NEXT: global_store_dwordx4 v40, v[4:7], s[34:35] offset:112
; GFX908-NEXT: global_store_dwordx4 v40, v[8:11], s[34:35] offset:64
; GFX908-NEXT: global_store_dwordx4 v40, v[12:15], s[34:35] offset:80
@@ -729,6 +1529,205 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_call(ptr addrspace(1) %arg)
; GFX908-NEXT: global_store_dwordx4 v40, v[24:27], s[34:35]
; GFX908-NEXT: global_store_dwordx4 v40, v[0:3], s[34:35] offset:16
; GFX908-NEXT: s_endpgm
+;
+; GFX90ADAG-LABEL: test_mfma_f32_32x32x1f32_call:
+; GFX90ADAG: ; %bb.0: ; %bb
+; GFX90ADAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX90ADAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX90ADAG-NEXT: s_mov_b32 s38, -1
+; GFX90ADAG-NEXT: s_mov_b32 s39, 0xe00000
+; GFX90ADAG-NEXT: s_add_u32 s36, s36, s11
+; GFX90ADAG-NEXT: s_addc_u32 s37, s37, 0
+; GFX90ADAG-NEXT: s_mov_b32 s12, s8
+; GFX90ADAG-NEXT: s_add_u32 s8, s4, 44
+; GFX90ADAG-NEXT: s_mov_b32 s13, s9
+; GFX90ADAG-NEXT: s_addc_u32 s9, s5, 0
+; GFX90ADAG-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX90ADAG-NEXT: s_getpc_b64 s[4:5]
+; GFX90ADAG-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4
+; GFX90ADAG-NEXT: s_addc_u32 s5, s5, foo@gotpcrel32@hi+12
+; GFX90ADAG-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX90ADAG-NEXT: s_mov_b32 s14, s10
+; GFX90ADAG-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX90ADAG-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX90ADAG-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX90ADAG-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX90ADAG-NEXT: v_mov_b32_e32 v31, v0
+; GFX90ADAG-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX90ADAG-NEXT: s_mov_b32 s32, 0
+; GFX90ADAG-NEXT: v_mov_b32_e32 v40, 0
+; GFX90ADAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90ADAG-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX90ADAG-NEXT: global_load_dwordx4 a[28:31], v40, s[34:35] offset:112
+; GFX90ADAG-NEXT: global_load_dwordx4 a[24:27], v40, s[34:35] offset:96
+; GFX90ADAG-NEXT: global_load_dwordx4 a[20:23], v40, s[34:35] offset:80
+; GFX90ADAG-NEXT: global_load_dwordx4 a[16:19], v40, s[34:35] offset:64
+; GFX90ADAG-NEXT: global_load_dwordx4 a[12:15], v40, s[34:35] offset:48
+; GFX90ADAG-NEXT: global_load_dwordx4 a[8:11], v40, s[34:35] offset:32
+; GFX90ADAG-NEXT: global_load_dwordx4 a[4:7], v40, s[34:35] offset:16
+; GFX90ADAG-NEXT: global_load_dwordx4 a[0:3], v40, s[34:35]
+; GFX90ADAG-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX90ADAG-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX90ADAG-NEXT: s_waitcnt vmcnt(0)
+; GFX90ADAG-NEXT: s_nop 0
+; GFX90ADAG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; GFX90ADAG-NEXT: s_nop 15
+; GFX90ADAG-NEXT: s_nop 2
+; GFX90ADAG-NEXT: global_store_dwordx4 v40, a[24:27], s[34:35] offset:96
+; GFX90ADAG-NEXT: global_store_dwordx4 v40, a[28:31], s[34:35] offset:112
+; GFX90ADAG-NEXT: global_store_dwordx4 v40, a[16:19], s[34:35] offset:64
+; GFX90ADAG-NEXT: global_store_dwordx4 v40, a[20:23], s[34:35] offset:80
+; GFX90ADAG-NEXT: global_store_dwordx4 v40, a[8:11], s[34:35] offset:32
+; GFX90ADAG-NEXT: global_store_dwordx4 v40, a[12:15], s[34:35] offset:48
+; GFX90ADAG-NEXT: global_store_dwordx4 v40, a[0:3], s[34:35]
+; GFX90ADAG-NEXT: global_store_dwordx4 v40, a[4:7], s[34:35] offset:16
+; GFX90ADAG-NEXT: s_endpgm
+;
+; GFX90AGSEL-LABEL: test_mfma_f32_32x32x1f32_call:
+; GFX90AGSEL: ; %bb.0: ; %bb
+; GFX90AGSEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX90AGSEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX90AGSEL-NEXT: s_mov_b32 s38, -1
+; GFX90AGSEL-NEXT: s_mov_b32 s39, 0xe00000
+; GFX90AGSEL-NEXT: s_add_u32 s36, s36, s11
+; GFX90AGSEL-NEXT: s_addc_u32 s37, s37, 0
+; GFX90AGSEL-NEXT: s_mov_b32 s16, s8
+; GFX90AGSEL-NEXT: s_add_u32 s8, s4, 44
+; GFX90AGSEL-NEXT: s_mov_b32 s15, s9
+; GFX90AGSEL-NEXT: s_addc_u32 s9, s5, 0
+; GFX90AGSEL-NEXT: s_mov_b64 s[12:13], s[0:1]
+; GFX90AGSEL-NEXT: s_getpc_b64 s[0:1]
+; GFX90AGSEL-NEXT: s_add_u32 s0, s0, foo@gotpcrel32@lo+4
+; GFX90AGSEL-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12
+; GFX90AGSEL-NEXT: s_load_dwordx2 s[18:19], s[0:1], 0x0
+; GFX90AGSEL-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX90AGSEL-NEXT: s_mov_b32 s14, s10
+; GFX90AGSEL-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX90AGSEL-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX90AGSEL-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX90AGSEL-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX90AGSEL-NEXT: s_mov_b64 s[4:5], s[12:13]
+; GFX90AGSEL-NEXT: s_mov_b32 s12, s16
+; GFX90AGSEL-NEXT: s_mov_b32 s13, s15
+; GFX90AGSEL-NEXT: v_mov_b32_e32 v31, v0
+; GFX90AGSEL-NEXT: s_mov_b32 s32, 0
+; GFX90AGSEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90AGSEL-NEXT: s_swappc_b64 s[30:31], s[18:19]
+; GFX90AGSEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX90AGSEL-NEXT: global_load_dwordx4 a[0:3], v0, s[34:35]
+; GFX90AGSEL-NEXT: global_load_dwordx4 a[4:7], v0, s[34:35] offset:16
+; GFX90AGSEL-NEXT: global_load_dwordx4 a[8:11], v0, s[34:35] offset:32
+; GFX90AGSEL-NEXT: global_load_dwordx4 a[12:15], v0, s[34:35] offset:48
+; GFX90AGSEL-NEXT: global_load_dwordx4 a[16:19], v0, s[34:35] offset:64
+; GFX90AGSEL-NEXT: global_load_dwordx4 a[20:23], v0, s[34:35] offset:80
+; GFX90AGSEL-NEXT: global_load_dwordx4 a[24:27], v0, s[34:35] offset:96
+; GFX90AGSEL-NEXT: global_load_dwordx4 a[28:31], v0, s[34:35] offset:112
+; GFX90AGSEL-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX90AGSEL-NEXT: v_mov_b32_e32 v2, 2.0
+; GFX90AGSEL-NEXT: s_waitcnt vmcnt(0)
+; GFX90AGSEL-NEXT: s_nop 0
+; GFX90AGSEL-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
+; GFX90AGSEL-NEXT: s_nop 15
+; GFX90AGSEL-NEXT: s_nop 2
+; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35]
+; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16
+; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[8:11], s[34:35] offset:32
+; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[12:15], s[34:35] offset:48
+; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[16:19], s[34:35] offset:64
+; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[20:23], s[34:35] offset:80
+; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96
+; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112
+; GFX90AGSEL-NEXT: s_endpgm
+;
+; GFX942DAG-LABEL: test_mfma_f32_32x32x1f32_call:
+; GFX942DAG: ; %bb.0: ; %bb
+; GFX942DAG-NEXT: s_mov_b32 s12, s8
+; GFX942DAG-NEXT: s_add_u32 s8, s4, 44
+; GFX942DAG-NEXT: s_mov_b32 s13, s9
+; GFX942DAG-NEXT: s_addc_u32 s9, s5, 0
+; GFX942DAG-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX942DAG-NEXT: s_getpc_b64 s[4:5]
+; GFX942DAG-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4
+; GFX942DAG-NEXT: s_addc_u32 s5, s5, foo@gotpcrel32@hi+12
+; GFX942DAG-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX942DAG-NEXT: s_mov_b32 s14, s10
+; GFX942DAG-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX942DAG-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX942DAG-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX942DAG-NEXT: v_mov_b32_e32 v31, v0
+; GFX942DAG-NEXT: s_mov_b32 s32, 0
+; GFX942DAG-NEXT: v_mov_b32_e32 v40, 0
+; GFX942DAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942DAG-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX942DAG-NEXT: global_load_dwordx4 a[28:31], v40, s[34:35] offset:112
+; GFX942DAG-NEXT: global_load_dwordx4 a[24:27], v40, s[34:35] offset:96
+; GFX942DAG-NEXT: global_load_dwordx4 a[20:23], v40, s[34:35] offset:80
+; GFX942DAG-NEXT: global_load_dwordx4 a[16:19], v40, s[34:35] offset:64
+; GFX942DAG-NEXT: global_load_dwordx4 a[12:15], v40, s[34:35] offset:48
+; GFX942DAG-NEXT: global_load_dwordx4 a[8:11], v40, s[34:35] offset:32
+; GFX942DAG-NEXT: global_load_dwordx4 a[4:7], v40, s[34:35] offset:16
+; GFX942DAG-NEXT: global_load_dwordx4 a[0:3], v40, s[34:35]
+; GFX942DAG-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX942DAG-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX942DAG-NEXT: s_waitcnt vmcnt(0)
+; GFX942DAG-NEXT: s_nop 0
+; GFX942DAG-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]
+; GFX942DAG-NEXT: s_nop 15
+; GFX942DAG-NEXT: s_nop 1
+; GFX942DAG-NEXT: global_store_dwordx4 v40, a[24:27], s[34:35] offset:96
+; GFX942DAG-NEXT: global_store_dwordx4 v40, a[28:31], s[34:35] offset:112
+; GFX942DAG-NEXT: global_store_dwordx4 v40, a[16:19], s[34:35] offset:64
+; GFX942DAG-NEXT: global_store_dwordx4 v40, a[20:23], s[34:35] offset:80
+; GFX942DAG-NEXT: global_store_dwordx4 v40, a[8:11], s[34:35] offset:32
+; GFX942DAG-NEXT: global_store_dwordx4 v40, a[12:15], s[34:35] offset:48
+; GFX942DAG-NEXT: global_store_dwordx4 v40, a[0:3], s[34:35]
+; GFX942DAG-NEXT: global_store_dwordx4 v40, a[4:7], s[34:35] offset:16
+; GFX942DAG-NEXT: s_endpgm
+;
+; GFX942GSEL-LABEL: test_mfma_f32_32x32x1f32_call:
+; GFX942GSEL: ; %bb.0: ; %bb
+; GFX942GSEL-NEXT: s_mov_b32 s12, s8
+; GFX942GSEL-NEXT: s_add_u32 s8, s4, 44
+; GFX942GSEL-NEXT: s_mov_b32 s13, s9
+; GFX942GSEL-NEXT: s_addc_u32 s9, s5, 0
+; GFX942GSEL-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX942GSEL-NEXT: s_getpc_b64 s[4:5]
+; GFX942GSEL-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4
+; GFX942GSEL-NEXT: s_addc_u32 s5, s5, foo@gotpcrel32@hi+12
+; GFX942GSEL-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX942GSEL-NEXT: s_mov_b32 s14, s10
+; GFX942GSEL-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX942GSEL-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX942GSEL-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX942GSEL-NEXT: v_mov_b32_e32 v31, v0
+; GFX942GSEL-NEXT: s_mov_b32 s32, 0
+; GFX942GSEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942GSEL-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX942GSEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942GSEL-NEXT: global_load_dwordx4 a[0:3], v0, s[34:35]
+; GFX942GSEL-NEXT: global_load_dwordx4 a[4:7], v0, s[34:35] offset:16
+; GFX942GSEL-NEXT: global_load_dwordx4 a[8:11], v0, s[34:35] offset:32
+; GFX942GSEL-NEXT: global_load_dwordx4 a[12:15], v0, s[34:35] offset:48
+; GFX942GSEL-NEXT: global_load_dwordx4 a[16:19], v0, s[34:35] offset:64
+; GFX942GSEL-NEXT: global_load_dwordx4 a[20:23], v0, s[34:35] offset:80
+; GFX942GSEL-NEXT: global_load_dwordx4 a[24:27], v0, s[34:35] offset:96
+; GFX942GSEL-NEXT: global_load_dwordx4 a[28:31], v0, s[34:35] offset:112
+; GFX942GSEL-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX942GSEL-NEXT: v_mov_b32_e32 v2, 2.0
+; GFX942GSEL-NEXT: s_waitcnt vmcnt(0)
+; GFX942GSEL-NEXT: s_nop 0
+; GFX942GSEL-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v2, a[0:31]
+; GFX942GSEL-NEXT: s_nop 15
+; GFX942GSEL-NEXT: s_nop 1
+; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35]
+; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16
+; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[8:11], s[34:35] offset:32
+; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[12:15], s[34:35] offset:48
+; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[16:19], s[34:35] offset:64
+; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[20:23], s[34:35] offset:80
+; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96
+; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112
+; GFX942GSEL-NEXT: s_endpgm
bb:
call void @foo()
%in.1 = load <32 x float>, ptr addrspace(1) %arg
@@ -831,59 +1830,59 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_call_multi_bb(ptr addrspace(
; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v6, v3, a[0:31] cbsz:1 abid:2 blgp:3
; GFX908-NEXT: s_nop 15
; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: v_accvgpr_read_b32 v6, a27
-; GFX908-NEXT: v_accvgpr_read_b32 v5, a26
-; GFX908-NEXT: v_accvgpr_read_b32 v4, a25
; GFX908-NEXT: v_accvgpr_read_b32 v3, a24
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a27
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:96
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v6, a31
-; GFX908-NEXT: v_accvgpr_read_b32 v5, a30
-; GFX908-NEXT: v_accvgpr_read_b32 v4, a29
; GFX908-NEXT: v_accvgpr_read_b32 v3, a28
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a31
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:112
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v6, a19
-; GFX908-NEXT: v_accvgpr_read_b32 v5, a18
-; GFX908-NEXT: v_accvgpr_read_b32 v4, a17
; GFX908-NEXT: v_accvgpr_read_b32 v3, a16
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a19
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:64
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v6, a23
-; GFX908-NEXT: v_accvgpr_read_b32 v5, a22
-; GFX908-NEXT: v_accvgpr_read_b32 v4, a21
; GFX908-NEXT: v_accvgpr_read_b32 v3, a20
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a23
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:80
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v6, a11
-; GFX908-NEXT: v_accvgpr_read_b32 v5, a10
-; GFX908-NEXT: v_accvgpr_read_b32 v4, a9
; GFX908-NEXT: v_accvgpr_read_b32 v3, a8
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a11
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:32
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v6, a15
-; GFX908-NEXT: v_accvgpr_read_b32 v5, a14
-; GFX908-NEXT: v_accvgpr_read_b32 v4, a13
; GFX908-NEXT: v_accvgpr_read_b32 v3, a12
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a15
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:48
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v6, a3
-; GFX908-NEXT: v_accvgpr_read_b32 v5, a2
-; GFX908-NEXT: v_accvgpr_read_b32 v4, a1
; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a3
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7]
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v6, a7
-; GFX908-NEXT: v_accvgpr_read_b32 v5, a6
-; GFX908-NEXT: v_accvgpr_read_b32 v4, a5
; GFX908-NEXT: v_accvgpr_read_b32 v3, a4
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a7
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:16
; GFX908-NEXT: s_cbranch_scc1 .LBB6_2
@@ -906,6 +1905,331 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_call_multi_bb(ptr addrspace(
; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX908-NEXT: .LBB6_2: ; %bb3
; GFX908-NEXT: s_endpgm
+;
+; GFX90ADAG-LABEL: test_mfma_f32_32x32x1f32_call_multi_bb:
+; GFX90ADAG: ; %bb.0: ; %bb1
+; GFX90ADAG-NEXT: s_mov_b32 s52, SCRATCH_RSRC_DWORD0
+; GFX90ADAG-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1
+; GFX90ADAG-NEXT: s_mov_b32 s54, -1
+; GFX90ADAG-NEXT: s_mov_b32 s55, 0xe00000
+; GFX90ADAG-NEXT: s_add_u32 s52, s52, s11
+; GFX90ADAG-NEXT: s_mov_b32 s14, s10
+; GFX90ADAG-NEXT: s_mov_b32 s12, s8
+; GFX90ADAG-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX90ADAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX90ADAG-NEXT: s_load_dword s8, s[4:5], 0x2c
+; GFX90ADAG-NEXT: v_mov_b32_e32 v2, 1.0
+; GFX90ADAG-NEXT: v_mov_b32_e32 v3, 2.0
+; GFX90ADAG-NEXT: s_addc_u32 s53, s53, 0
+; GFX90ADAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90ADAG-NEXT: s_load_dwordx16 s[36:51], s[6:7], 0x0
+; GFX90ADAG-NEXT: s_load_dwordx16 s[16:31], s[6:7], 0x40
+; GFX90ADAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX90ADAG-NEXT: s_bitcmp0_b32 s8, 0
+; GFX90ADAG-NEXT: s_mov_b32 s32, 0
+; GFX90ADAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90ADAG-NEXT: v_accvgpr_write_b32 a0, s36
+; GFX90ADAG-NEXT: v_accvgpr_write_b32 a1, s37
+; GFX90ADAG-NEXT: v_accvgpr_write_b32 a2, s38
+; GFX90ADAG-NEXT: v_accvgpr_write_b32 a3, s39
+; GFX90ADAG-NEXT: v_accvgpr_write_b32 a4, s40
+; GFX90ADAG-NEXT: v_accvgpr_write_b32 a5, s41
+; GFX90ADAG-NEXT: v_accvgpr_write_b32 a6, s42
+; GFX90ADAG-NEXT: v_accvgpr_write_b32 a7, s43
+; GFX90ADAG-NEXT: v_accvgpr_write_b32 a8, s44
+; GFX90ADAG-NEXT: v_accvgpr_write_b32 a9, s45
+; GFX90ADAG-NEXT: v_accvgpr_write_b32 a10, s46
+; GFX90ADAG-NEXT: v_accvgpr_write_b32 a11, s47
+; GFX90ADAG-NEXT: v_accvgpr_write_b32 a12, s48
+; GFX90ADAG-NEXT: v_accvgpr_write_b32 a13, s49
+; GFX90ADAG-NEXT: v_accvgpr_write_b32 a14, s50
+; GFX90ADAG-NEXT: v_accvgpr_write_b32 a15, s51
+; GFX90ADAG-NEXT: v_accvgpr_write_b32 a16, s16
+; GFX90ADAG-NEXT: v_accvgpr_write_b32 a17, s17
+; GFX90ADAG-NEXT: v_accvgpr_write_b32 a18, s18
+; GFX90ADAG-NEXT: v_accvgpr_write_b32 a19, s19
+; GFX90ADAG-NEXT: v_accvgpr_write_b32 a20, s20
+; GFX90ADAG-NEXT: v_accvgpr_write_b32 a21, s21
+; GFX90ADAG-NEXT: v_accvgpr_write_b32 a22, s22
+; GFX90ADAG-NEXT: v_accvgpr_write_b32 a23, s23
+; GFX90ADAG-NEXT: v_accvgpr_write_b32 a24, s24
+; GFX90ADAG-NEXT: v_accvgpr_write_b32 a25, s25
+; GFX90ADAG-NEXT: v_accvgpr_write_b32 a26, s26
+; GFX90ADAG-NEXT: v_accvgpr_write_b32 a27, s27
+; GFX90ADAG-NEXT: v_accvgpr_write_b32 a28, s28
+; GFX90ADAG-NEXT: v_accvgpr_write_b32 a29, s29
+; GFX90ADAG-NEXT: v_accvgpr_write_b32 a30, s30
+; GFX90ADAG-NEXT: v_accvgpr_write_b32 a31, s31
+; GFX90ADAG-NEXT: s_nop 1
+; GFX90ADAG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] cbsz:1 abid:2 blgp:3
+; GFX90ADAG-NEXT: s_nop 15
+; GFX90ADAG-NEXT: s_nop 2
+; GFX90ADAG-NEXT: global_store_dwordx4 v1, a[24:27], s[6:7] offset:96
+; GFX90ADAG-NEXT: global_store_dwordx4 v1, a[28:31], s[6:7] offset:112
+; GFX90ADAG-NEXT: global_store_dwordx4 v1, a[16:19], s[6:7] offset:64
+; GFX90ADAG-NEXT: global_store_dwordx4 v1, a[20:23], s[6:7] offset:80
+; GFX90ADAG-NEXT: global_store_dwordx4 v1, a[8:11], s[6:7] offset:32
+; GFX90ADAG-NEXT: global_store_dwordx4 v1, a[12:15], s[6:7] offset:48
+; GFX90ADAG-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7]
+; GFX90ADAG-NEXT: global_store_dwordx4 v1, a[4:7], s[6:7] offset:16
+; GFX90ADAG-NEXT: s_cbranch_scc1 .LBB6_2
+; GFX90ADAG-NEXT: ; %bb.1: ; %bb2
+; GFX90ADAG-NEXT: s_add_u32 s8, s4, 48
+; GFX90ADAG-NEXT: s_mov_b32 s13, s9
+; GFX90ADAG-NEXT: s_addc_u32 s9, s5, 0
+; GFX90ADAG-NEXT: s_getpc_b64 s[4:5]
+; GFX90ADAG-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4
+; GFX90ADAG-NEXT: s_addc_u32 s5, s5, foo@gotpcrel32@hi+12
+; GFX90ADAG-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX90ADAG-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX90ADAG-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX90ADAG-NEXT: s_mov_b64 s[0:1], s[52:53]
+; GFX90ADAG-NEXT: v_mov_b32_e32 v31, v0
+; GFX90ADAG-NEXT: s_mov_b64 s[2:3], s[54:55]
+; GFX90ADAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90ADAG-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX90ADAG-NEXT: .LBB6_2: ; %bb3
+; GFX90ADAG-NEXT: s_endpgm
+;
+; GFX90AGSEL-LABEL: test_mfma_f32_32x32x1f32_call_multi_bb:
+; GFX90AGSEL: ; %bb.0: ; %bb1
+; GFX90AGSEL-NEXT: s_mov_b32 s68, SCRATCH_RSRC_DWORD0
+; GFX90AGSEL-NEXT: s_mov_b32 s69, SCRATCH_RSRC_DWORD1
+; GFX90AGSEL-NEXT: s_mov_b32 s70, -1
+; GFX90AGSEL-NEXT: s_mov_b32 s71, 0xe00000
+; GFX90AGSEL-NEXT: s_add_u32 s68, s68, s11
+; GFX90AGSEL-NEXT: s_mov_b32 s14, s10
+; GFX90AGSEL-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX90AGSEL-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX90AGSEL-NEXT: s_mov_b64 s[16:17], s[0:1]
+; GFX90AGSEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX90AGSEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX90AGSEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90AGSEL-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0x0
+; GFX90AGSEL-NEXT: s_load_dwordx16 s[52:67], s[0:1], 0x40
+; GFX90AGSEL-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX90AGSEL-NEXT: v_mov_b32_e32 v2, 2.0
+; GFX90AGSEL-NEXT: s_addc_u32 s69, s69, 0
+; GFX90AGSEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a0, s36
+; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a16, s52
+; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a1, s37
+; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a2, s38
+; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a3, s39
+; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a4, s40
+; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a5, s41
+; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a6, s42
+; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a7, s43
+; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a8, s44
+; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a9, s45
+; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a10, s46
+; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a11, s47
+; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a12, s48
+; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a13, s49
+; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a14, s50
+; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a15, s51
+; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a17, s53
+; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a18, s54
+; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a19, s55
+; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a20, s56
+; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a21, s57
+; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a22, s58
+; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a23, s59
+; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a24, s60
+; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a25, s61
+; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a26, s62
+; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a27, s63
+; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a28, s64
+; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a29, s65
+; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a30, s66
+; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a31, s67
+; GFX90AGSEL-NEXT: s_xor_b32 s2, s2, 1
+; GFX90AGSEL-NEXT: s_and_b32 s2, s2, 1
+; GFX90AGSEL-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3
+; GFX90AGSEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX90AGSEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX90AGSEL-NEXT: s_mov_b32 s32, 0
+; GFX90AGSEL-NEXT: s_nop 15
+; GFX90AGSEL-NEXT: global_store_dwordx4 v1, a[0:3], s[0:1]
+; GFX90AGSEL-NEXT: global_store_dwordx4 v1, a[4:7], s[0:1] offset:16
+; GFX90AGSEL-NEXT: global_store_dwordx4 v1, a[8:11], s[0:1] offset:32
+; GFX90AGSEL-NEXT: global_store_dwordx4 v1, a[12:15], s[0:1] offset:48
+; GFX90AGSEL-NEXT: global_store_dwordx4 v1, a[16:19], s[0:1] offset:64
+; GFX90AGSEL-NEXT: global_store_dwordx4 v1, a[20:23], s[0:1] offset:80
+; GFX90AGSEL-NEXT: global_store_dwordx4 v1, a[24:27], s[0:1] offset:96
+; GFX90AGSEL-NEXT: global_store_dwordx4 v1, a[28:31], s[0:1] offset:112
+; GFX90AGSEL-NEXT: s_cbranch_scc1 .LBB6_2
+; GFX90AGSEL-NEXT: ; %bb.1: ; %bb2
+; GFX90AGSEL-NEXT: s_getpc_b64 s[0:1]
+; GFX90AGSEL-NEXT: s_add_u32 s0, s0, foo@gotpcrel32@lo+4
+; GFX90AGSEL-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12
+; GFX90AGSEL-NEXT: s_load_dwordx2 s[18:19], s[0:1], 0x0
+; GFX90AGSEL-NEXT: s_mov_b32 s12, s8
+; GFX90AGSEL-NEXT: s_add_u32 s8, s4, 48
+; GFX90AGSEL-NEXT: s_mov_b64 s[0:1], s[68:69]
+; GFX90AGSEL-NEXT: s_mov_b32 s13, s9
+; GFX90AGSEL-NEXT: s_addc_u32 s9, s5, 0
+; GFX90AGSEL-NEXT: s_mov_b64 s[2:3], s[70:71]
+; GFX90AGSEL-NEXT: s_mov_b64 s[4:5], s[16:17]
+; GFX90AGSEL-NEXT: v_mov_b32_e32 v31, v0
+; GFX90AGSEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90AGSEL-NEXT: s_swappc_b64 s[30:31], s[18:19]
+; GFX90AGSEL-NEXT: .LBB6_2: ; %bb3
+; GFX90AGSEL-NEXT: s_endpgm
+;
+; GFX942DAG-LABEL: test_mfma_f32_32x32x1f32_call_multi_bb:
+; GFX942DAG: ; %bb.0: ; %bb1
+; GFX942DAG-NEXT: s_mov_b32 s14, s10
+; GFX942DAG-NEXT: s_mov_b32 s12, s8
+; GFX942DAG-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX942DAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942DAG-NEXT: s_load_dword s8, s[4:5], 0x2c
+; GFX942DAG-NEXT: v_mov_b32_e32 v2, 1.0
+; GFX942DAG-NEXT: v_mov_b32_e32 v3, 2.0
+; GFX942DAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX942DAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942DAG-NEXT: s_load_dwordx16 s[36:51], s[6:7], 0x0
+; GFX942DAG-NEXT: s_load_dwordx16 s[16:31], s[6:7], 0x40
+; GFX942DAG-NEXT: s_bitcmp0_b32 s8, 0
+; GFX942DAG-NEXT: s_mov_b32 s32, 0
+; GFX942DAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942DAG-NEXT: v_accvgpr_write_b32 a0, s36
+; GFX942DAG-NEXT: v_accvgpr_write_b32 a1, s37
+; GFX942DAG-NEXT: v_accvgpr_write_b32 a2, s38
+; GFX942DAG-NEXT: v_accvgpr_write_b32 a3, s39
+; GFX942DAG-NEXT: v_accvgpr_write_b32 a4, s40
+; GFX942DAG-NEXT: v_accvgpr_write_b32 a5, s41
+; GFX942DAG-NEXT: v_accvgpr_write_b32 a6, s42
+; GFX942DAG-NEXT: v_accvgpr_write_b32 a7, s43
+; GFX942DAG-NEXT: v_accvgpr_write_b32 a8, s44
+; GFX942DAG-NEXT: v_accvgpr_write_b32 a9, s45
+; GFX942DAG-NEXT: v_accvgpr_write_b32 a10, s46
+; GFX942DAG-NEXT: v_accvgpr_write_b32 a11, s47
+; GFX942DAG-NEXT: v_accvgpr_write_b32 a12, s48
+; GFX942DAG-NEXT: v_accvgpr_write_b32 a13, s49
+; GFX942DAG-NEXT: v_accvgpr_write_b32 a14, s50
+; GFX942DAG-NEXT: v_accvgpr_write_b32 a15, s51
+; GFX942DAG-NEXT: v_accvgpr_write_b32 a16, s16
+; GFX942DAG-NEXT: v_accvgpr_write_b32 a17, s17
+; GFX942DAG-NEXT: v_accvgpr_write_b32 a18, s18
+; GFX942DAG-NEXT: v_accvgpr_write_b32 a19, s19
+; GFX942DAG-NEXT: v_accvgpr_write_b32 a20, s20
+; GFX942DAG-NEXT: v_accvgpr_write_b32 a21, s21
+; GFX942DAG-NEXT: v_accvgpr_write_b32 a22, s22
+; GFX942DAG-NEXT: v_accvgpr_write_b32 a23, s23
+; GFX942DAG-NEXT: v_accvgpr_write_b32 a24, s24
+; GFX942DAG-NEXT: v_accvgpr_write_b32 a25, s25
+; GFX942DAG-NEXT: v_accvgpr_write_b32 a26, s26
+; GFX942DAG-NEXT: v_accvgpr_write_b32 a27, s27
+; GFX942DAG-NEXT: v_accvgpr_write_b32 a28, s28
+; GFX942DAG-NEXT: v_accvgpr_write_b32 a29, s29
+; GFX942DAG-NEXT: v_accvgpr_write_b32 a30, s30
+; GFX942DAG-NEXT: v_accvgpr_write_b32 a31, s31
+; GFX942DAG-NEXT: s_nop 1
+; GFX942DAG-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v2, v3, a[0:31] cbsz:1 abid:2 blgp:3
+; GFX942DAG-NEXT: s_nop 15
+; GFX942DAG-NEXT: s_nop 1
+; GFX942DAG-NEXT: global_store_dwordx4 v1, a[24:27], s[6:7] offset:96
+; GFX942DAG-NEXT: global_store_dwordx4 v1, a[28:31], s[6:7] offset:112
+; GFX942DAG-NEXT: global_store_dwordx4 v1, a[16:19], s[6:7] offset:64
+; GFX942DAG-NEXT: global_store_dwordx4 v1, a[20:23], s[6:7] offset:80
+; GFX942DAG-NEXT: global_store_dwordx4 v1, a[8:11], s[6:7] offset:32
+; GFX942DAG-NEXT: global_store_dwordx4 v1, a[12:15], s[6:7] offset:48
+; GFX942DAG-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7]
+; GFX942DAG-NEXT: global_store_dwordx4 v1, a[4:7], s[6:7] offset:16
+; GFX942DAG-NEXT: s_cbranch_scc1 .LBB6_2
+; GFX942DAG-NEXT: ; %bb.1: ; %bb2
+; GFX942DAG-NEXT: s_add_u32 s8, s4, 48
+; GFX942DAG-NEXT: s_mov_b32 s13, s9
+; GFX942DAG-NEXT: s_addc_u32 s9, s5, 0
+; GFX942DAG-NEXT: s_getpc_b64 s[4:5]
+; GFX942DAG-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4
+; GFX942DAG-NEXT: s_addc_u32 s5, s5, foo@gotpcrel32@hi+12
+; GFX942DAG-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX942DAG-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX942DAG-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX942DAG-NEXT: v_mov_b32_e32 v31, v0
+; GFX942DAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942DAG-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX942DAG-NEXT: .LBB6_2: ; %bb3
+; GFX942DAG-NEXT: s_endpgm
+;
+; GFX942GSEL-LABEL: test_mfma_f32_32x32x1f32_call_multi_bb:
+; GFX942GSEL: ; %bb.0: ; %bb1
+; GFX942GSEL-NEXT: s_mov_b32 s14, s10
+; GFX942GSEL-NEXT: s_mov_b32 s12, s8
+; GFX942GSEL-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX942GSEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942GSEL-NEXT: s_load_dword s8, s[4:5], 0x2c
+; GFX942GSEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942GSEL-NEXT: s_load_dwordx16 s[16:31], s[6:7], 0x0
+; GFX942GSEL-NEXT: s_load_dwordx16 s[36:51], s[6:7], 0x40
+; GFX942GSEL-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX942GSEL-NEXT: v_mov_b32_e32 v2, 2.0
+; GFX942GSEL-NEXT: s_xor_b32 s8, s8, 1
+; GFX942GSEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942GSEL-NEXT: v_accvgpr_write_b32 a0, s16
+; GFX942GSEL-NEXT: v_accvgpr_write_b32 a16, s36
+; GFX942GSEL-NEXT: v_accvgpr_write_b32 a1, s17
+; GFX942GSEL-NEXT: v_accvgpr_write_b32 a2, s18
+; GFX942GSEL-NEXT: v_accvgpr_write_b32 a3, s19
+; GFX942GSEL-NEXT: v_accvgpr_write_b32 a4, s20
+; GFX942GSEL-NEXT: v_accvgpr_write_b32 a5, s21
+; GFX942GSEL-NEXT: v_accvgpr_write_b32 a6, s22
+; GFX942GSEL-NEXT: v_accvgpr_write_b32 a7, s23
+; GFX942GSEL-NEXT: v_accvgpr_write_b32 a8, s24
+; GFX942GSEL-NEXT: v_accvgpr_write_b32 a9, s25
+; GFX942GSEL-NEXT: v_accvgpr_write_b32 a10, s26
+; GFX942GSEL-NEXT: v_accvgpr_write_b32 a11, s27
+; GFX942GSEL-NEXT: v_accvgpr_write_b32 a12, s28
+; GFX942GSEL-NEXT: v_accvgpr_write_b32 a13, s29
+; GFX942GSEL-NEXT: v_accvgpr_write_b32 a14, s30
+; GFX942GSEL-NEXT: v_accvgpr_write_b32 a15, s31
+; GFX942GSEL-NEXT: v_accvgpr_write_b32 a17, s37
+; GFX942GSEL-NEXT: v_accvgpr_write_b32 a18, s38
+; GFX942GSEL-NEXT: v_accvgpr_write_b32 a19, s39
+; GFX942GSEL-NEXT: v_accvgpr_write_b32 a20, s40
+; GFX942GSEL-NEXT: v_accvgpr_write_b32 a21, s41
+; GFX942GSEL-NEXT: v_accvgpr_write_b32 a22, s42
+; GFX942GSEL-NEXT: v_accvgpr_write_b32 a23, s43
+; GFX942GSEL-NEXT: v_accvgpr_write_b32 a24, s44
+; GFX942GSEL-NEXT: v_accvgpr_write_b32 a25, s45
+; GFX942GSEL-NEXT: v_accvgpr_write_b32 a26, s46
+; GFX942GSEL-NEXT: v_accvgpr_write_b32 a27, s47
+; GFX942GSEL-NEXT: v_accvgpr_write_b32 a28, s48
+; GFX942GSEL-NEXT: v_accvgpr_write_b32 a29, s49
+; GFX942GSEL-NEXT: v_accvgpr_write_b32 a30, s50
+; GFX942GSEL-NEXT: v_accvgpr_write_b32 a31, s51
+; GFX942GSEL-NEXT: s_and_b32 s8, s8, 1
+; GFX942GSEL-NEXT: s_cmp_lg_u32 s8, 0
+; GFX942GSEL-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3
+; GFX942GSEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX942GSEL-NEXT: s_mov_b32 s32, 0
+; GFX942GSEL-NEXT: s_nop 15
+; GFX942GSEL-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7]
+; GFX942GSEL-NEXT: global_store_dwordx4 v1, a[4:7], s[6:7] offset:16
+; GFX942GSEL-NEXT: global_store_dwordx4 v1, a[8:11], s[6:7] offset:32
+; GFX942GSEL-NEXT: global_store_dwordx4 v1, a[12:15], s[6:7] offset:48
+; GFX942GSEL-NEXT: global_store_dwordx4 v1, a[16:19], s[6:7] offset:64
+; GFX942GSEL-NEXT: global_store_dwordx4 v1, a[20:23], s[6:7] offset:80
+; GFX942GSEL-NEXT: global_store_dwordx4 v1, a[24:27], s[6:7] offset:96
+; GFX942GSEL-NEXT: global_store_dwordx4 v1, a[28:31], s[6:7] offset:112
+; GFX942GSEL-NEXT: s_cbranch_scc1 .LBB6_2
+; GFX942GSEL-NEXT: ; %bb.1: ; %bb2
+; GFX942GSEL-NEXT: s_getpc_b64 s[6:7]
+; GFX942GSEL-NEXT: s_add_u32 s6, s6, foo@gotpcrel32@lo+4
+; GFX942GSEL-NEXT: s_addc_u32 s7, s7, foo@gotpcrel32@hi+12
+; GFX942GSEL-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
+; GFX942GSEL-NEXT: s_add_u32 s8, s4, 48
+; GFX942GSEL-NEXT: s_mov_b32 s13, s9
+; GFX942GSEL-NEXT: s_addc_u32 s9, s5, 0
+; GFX942GSEL-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX942GSEL-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX942GSEL-NEXT: v_mov_b32_e32 v31, v0
+; GFX942GSEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942GSEL-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX942GSEL-NEXT: .LBB6_2: ; %bb3
+; GFX942GSEL-NEXT: s_endpgm
bb1:
%in.1 = load <32 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 1, i32 2, i32 3)
@@ -972,40 +2296,40 @@ define void @test_mfma_f32_32x32x1f32_nonentry_noagpr(ptr addrspace(1) %arg) #0
; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
; GFX908-NEXT: s_nop 15
; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: v_accvgpr_read_b32 v5, a27
-; GFX908-NEXT: v_accvgpr_read_b32 v4, a26
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a25
; GFX908-NEXT: v_accvgpr_read_b32 v2, a24
-; GFX908-NEXT: v_accvgpr_read_b32 v9, a31
-; GFX908-NEXT: v_accvgpr_read_b32 v8, a30
-; GFX908-NEXT: v_accvgpr_read_b32 v7, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a27
; GFX908-NEXT: v_accvgpr_read_b32 v6, a28
-; GFX908-NEXT: v_accvgpr_read_b32 v13, a19
-; GFX908-NEXT: v_accvgpr_read_b32 v12, a18
-; GFX908-NEXT: v_accvgpr_read_b32 v11, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v7, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v8, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v9, a31
; GFX908-NEXT: v_accvgpr_read_b32 v10, a16
-; GFX908-NEXT: v_accvgpr_read_b32 v17, a23
-; GFX908-NEXT: v_accvgpr_read_b32 v16, a22
-; GFX908-NEXT: v_accvgpr_read_b32 v15, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v11, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v12, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v13, a19
; GFX908-NEXT: v_accvgpr_read_b32 v14, a20
-; GFX908-NEXT: v_accvgpr_read_b32 v21, a11
-; GFX908-NEXT: v_accvgpr_read_b32 v20, a10
-; GFX908-NEXT: v_accvgpr_read_b32 v19, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v15, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v16, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v17, a23
; GFX908-NEXT: v_accvgpr_read_b32 v18, a8
-; GFX908-NEXT: v_accvgpr_read_b32 v25, a15
-; GFX908-NEXT: v_accvgpr_read_b32 v24, a14
-; GFX908-NEXT: v_accvgpr_read_b32 v23, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v19, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v20, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v21, a11
; GFX908-NEXT: v_accvgpr_read_b32 v22, a12
-; GFX908-NEXT: v_accvgpr_read_b32 v29, a3
-; GFX908-NEXT: v_accvgpr_read_b32 v28, a2
-; GFX908-NEXT: v_accvgpr_read_b32 v27, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v23, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v24, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v25, a15
; GFX908-NEXT: v_accvgpr_read_b32 v26, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v27, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v28, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v29, a3
; GFX908-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:96
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v5, a7
-; GFX908-NEXT: v_accvgpr_read_b32 v4, a6
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a5
; GFX908-NEXT: v_accvgpr_read_b32 v2, a4
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a7
; GFX908-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:112
; GFX908-NEXT: global_store_dwordx4 v[0:1], v[10:13], off offset:64
; GFX908-NEXT: global_store_dwordx4 v[0:1], v[14:17], off offset:80
@@ -1015,6 +2339,122 @@ define void @test_mfma_f32_32x32x1f32_nonentry_noagpr(ptr addrspace(1) %arg) #0
; GFX908-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:16
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90ADAG-LABEL: test_mfma_f32_32x32x1f32_nonentry_noagpr:
+; GFX90ADAG: ; %bb.0: ; %bb
+; GFX90ADAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90ADAG-NEXT: global_load_dwordx4 v[30:33], v[0:1], off offset:112
+; GFX90ADAG-NEXT: global_load_dwordx4 v[26:29], v[0:1], off offset:96
+; GFX90ADAG-NEXT: global_load_dwordx4 v[22:25], v[0:1], off offset:80
+; GFX90ADAG-NEXT: global_load_dwordx4 v[18:21], v[0:1], off offset:64
+; GFX90ADAG-NEXT: global_load_dwordx4 v[14:17], v[0:1], off offset:48
+; GFX90ADAG-NEXT: global_load_dwordx4 v[10:13], v[0:1], off offset:32
+; GFX90ADAG-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16
+; GFX90ADAG-NEXT: global_load_dwordx4 v[2:5], v[0:1], off
+; GFX90ADAG-NEXT: v_mov_b32_e32 v34, 1.0
+; GFX90ADAG-NEXT: v_mov_b32_e32 v35, 2.0
+; GFX90ADAG-NEXT: s_waitcnt vmcnt(0)
+; GFX90ADAG-NEXT: s_nop 0
+; GFX90ADAG-NEXT: v_mfma_f32_32x32x1f32 v[2:33], v34, v35, v[2:33]
+; GFX90ADAG-NEXT: s_nop 15
+; GFX90ADAG-NEXT: s_nop 2
+; GFX90ADAG-NEXT: global_store_dwordx4 v[0:1], v[26:29], off offset:96
+; GFX90ADAG-NEXT: global_store_dwordx4 v[0:1], v[30:33], off offset:112
+; GFX90ADAG-NEXT: global_store_dwordx4 v[0:1], v[18:21], off offset:64
+; GFX90ADAG-NEXT: global_store_dwordx4 v[0:1], v[22:25], off offset:80
+; GFX90ADAG-NEXT: global_store_dwordx4 v[0:1], v[10:13], off offset:32
+; GFX90ADAG-NEXT: global_store_dwordx4 v[0:1], v[14:17], off offset:48
+; GFX90ADAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; GFX90ADAG-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16
+; GFX90ADAG-NEXT: s_waitcnt vmcnt(0)
+; GFX90ADAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90AGSEL-LABEL: test_mfma_f32_32x32x1f32_nonentry_noagpr:
+; GFX90AGSEL: ; %bb.0: ; %bb
+; GFX90AGSEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90AGSEL-NEXT: global_load_dwordx4 v[2:5], v[0:1], off
+; GFX90AGSEL-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16
+; GFX90AGSEL-NEXT: global_load_dwordx4 v[10:13], v[0:1], off offset:32
+; GFX90AGSEL-NEXT: global_load_dwordx4 v[14:17], v[0:1], off offset:48
+; GFX90AGSEL-NEXT: global_load_dwordx4 v[18:21], v[0:1], off offset:64
+; GFX90AGSEL-NEXT: global_load_dwordx4 v[22:25], v[0:1], off offset:80
+; GFX90AGSEL-NEXT: global_load_dwordx4 v[26:29], v[0:1], off offset:96
+; GFX90AGSEL-NEXT: global_load_dwordx4 v[30:33], v[0:1], off offset:112
+; GFX90AGSEL-NEXT: v_mov_b32_e32 v34, 1.0
+; GFX90AGSEL-NEXT: v_mov_b32_e32 v35, 2.0
+; GFX90AGSEL-NEXT: s_waitcnt vmcnt(0)
+; GFX90AGSEL-NEXT: s_nop 0
+; GFX90AGSEL-NEXT: v_mfma_f32_32x32x1f32 v[2:33], v34, v35, v[2:33]
+; GFX90AGSEL-NEXT: s_nop 15
+; GFX90AGSEL-NEXT: s_nop 2
+; GFX90AGSEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; GFX90AGSEL-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16
+; GFX90AGSEL-NEXT: global_store_dwordx4 v[0:1], v[10:13], off offset:32
+; GFX90AGSEL-NEXT: global_store_dwordx4 v[0:1], v[14:17], off offset:48
+; GFX90AGSEL-NEXT: global_store_dwordx4 v[0:1], v[18:21], off offset:64
+; GFX90AGSEL-NEXT: global_store_dwordx4 v[0:1], v[22:25], off offset:80
+; GFX90AGSEL-NEXT: global_store_dwordx4 v[0:1], v[26:29], off offset:96
+; GFX90AGSEL-NEXT: global_store_dwordx4 v[0:1], v[30:33], off offset:112
+; GFX90AGSEL-NEXT: s_waitcnt vmcnt(0)
+; GFX90AGSEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942DAG-LABEL: test_mfma_f32_32x32x1f32_nonentry_noagpr:
+; GFX942DAG: ; %bb.0: ; %bb
+; GFX942DAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942DAG-NEXT: global_load_dwordx4 v[30:33], v[0:1], off offset:112
+; GFX942DAG-NEXT: global_load_dwordx4 v[26:29], v[0:1], off offset:96
+; GFX942DAG-NEXT: global_load_dwordx4 v[22:25], v[0:1], off offset:80
+; GFX942DAG-NEXT: global_load_dwordx4 v[18:21], v[0:1], off offset:64
+; GFX942DAG-NEXT: global_load_dwordx4 v[14:17], v[0:1], off offset:48
+; GFX942DAG-NEXT: global_load_dwordx4 v[10:13], v[0:1], off offset:32
+; GFX942DAG-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16
+; GFX942DAG-NEXT: global_load_dwordx4 v[2:5], v[0:1], off
+; GFX942DAG-NEXT: v_mov_b32_e32 v34, 1.0
+; GFX942DAG-NEXT: v_mov_b32_e32 v35, 2.0
+; GFX942DAG-NEXT: s_waitcnt vmcnt(0)
+; GFX942DAG-NEXT: s_nop 0
+; GFX942DAG-NEXT: v_mfma_f32_32x32x1_2b_f32 v[2:33], v34, v35, v[2:33]
+; GFX942DAG-NEXT: s_nop 15
+; GFX942DAG-NEXT: s_nop 1
+; GFX942DAG-NEXT: global_store_dwordx4 v[0:1], v[26:29], off offset:96
+; GFX942DAG-NEXT: global_store_dwordx4 v[0:1], v[30:33], off offset:112
+; GFX942DAG-NEXT: global_store_dwordx4 v[0:1], v[18:21], off offset:64
+; GFX942DAG-NEXT: global_store_dwordx4 v[0:1], v[22:25], off offset:80
+; GFX942DAG-NEXT: global_store_dwordx4 v[0:1], v[10:13], off offset:32
+; GFX942DAG-NEXT: global_store_dwordx4 v[0:1], v[14:17], off offset:48
+; GFX942DAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; GFX942DAG-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16
+; GFX942DAG-NEXT: s_waitcnt vmcnt(0)
+; GFX942DAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942GSEL-LABEL: test_mfma_f32_32x32x1f32_nonentry_noagpr:
+; GFX942GSEL: ; %bb.0: ; %bb
+; GFX942GSEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942GSEL-NEXT: global_load_dwordx4 v[2:5], v[0:1], off
+; GFX942GSEL-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16
+; GFX942GSEL-NEXT: global_load_dwordx4 v[10:13], v[0:1], off offset:32
+; GFX942GSEL-NEXT: global_load_dwordx4 v[14:17], v[0:1], off offset:48
+; GFX942GSEL-NEXT: global_load_dwordx4 v[18:21], v[0:1], off offset:64
+; GFX942GSEL-NEXT: global_load_dwordx4 v[22:25], v[0:1], off offset:80
+; GFX942GSEL-NEXT: global_load_dwordx4 v[26:29], v[0:1], off offset:96
+; GFX942GSEL-NEXT: global_load_dwordx4 v[30:33], v[0:1], off offset:112
+; GFX942GSEL-NEXT: v_mov_b32_e32 v34, 1.0
+; GFX942GSEL-NEXT: v_mov_b32_e32 v35, 2.0
+; GFX942GSEL-NEXT: s_waitcnt vmcnt(0)
+; GFX942GSEL-NEXT: s_nop 0
+; GFX942GSEL-NEXT: v_mfma_f32_32x32x1_2b_f32 v[2:33], v34, v35, v[2:33]
+; GFX942GSEL-NEXT: s_nop 15
+; GFX942GSEL-NEXT: s_nop 1
+; GFX942GSEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; GFX942GSEL-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16
+; GFX942GSEL-NEXT: global_store_dwordx4 v[0:1], v[10:13], off offset:32
+; GFX942GSEL-NEXT: global_store_dwordx4 v[0:1], v[14:17], off offset:48
+; GFX942GSEL-NEXT: global_store_dwordx4 v[0:1], v[18:21], off offset:64
+; GFX942GSEL-NEXT: global_store_dwordx4 v[0:1], v[22:25], off offset:80
+; GFX942GSEL-NEXT: global_store_dwordx4 v[0:1], v[26:29], off offset:96
+; GFX942GSEL-NEXT: global_store_dwordx4 v[0:1], v[30:33], off offset:112
+; GFX942GSEL-NEXT: s_waitcnt vmcnt(0)
+; GFX942GSEL-NEXT: s_setpc_b64 s[30:31]
bb:
%in.1 = load <32 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0)
@@ -1073,40 +2513,40 @@ define void @test_mfma_f32_32x32x1f32_nonentry_with_agpr(ptr addrspace(1) %arg)
; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
; GFX908-NEXT: s_nop 15
; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: v_accvgpr_read_b32 v5, a27
-; GFX908-NEXT: v_accvgpr_read_b32 v4, a26
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a25
; GFX908-NEXT: v_accvgpr_read_b32 v2, a24
-; GFX908-NEXT: v_accvgpr_read_b32 v9, a31
-; GFX908-NEXT: v_accvgpr_read_b32 v8, a30
-; GFX908-NEXT: v_accvgpr_read_b32 v7, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a27
; GFX908-NEXT: v_accvgpr_read_b32 v6, a28
-; GFX908-NEXT: v_accvgpr_read_b32 v13, a19
-; GFX908-NEXT: v_accvgpr_read_b32 v12, a18
-; GFX908-NEXT: v_accvgpr_read_b32 v11, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v7, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v8, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v9, a31
; GFX908-NEXT: v_accvgpr_read_b32 v10, a16
-; GFX908-NEXT: v_accvgpr_read_b32 v17, a23
-; GFX908-NEXT: v_accvgpr_read_b32 v16, a22
-; GFX908-NEXT: v_accvgpr_read_b32 v15, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v11, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v12, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v13, a19
; GFX908-NEXT: v_accvgpr_read_b32 v14, a20
-; GFX908-NEXT: v_accvgpr_read_b32 v21, a11
-; GFX908-NEXT: v_accvgpr_read_b32 v20, a10
-; GFX908-NEXT: v_accvgpr_read_b32 v19, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v15, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v16, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v17, a23
; GFX908-NEXT: v_accvgpr_read_b32 v18, a8
-; GFX908-NEXT: v_accvgpr_read_b32 v25, a15
-; GFX908-NEXT: v_accvgpr_read_b32 v24, a14
-; GFX908-NEXT: v_accvgpr_read_b32 v23, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v19, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v20, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v21, a11
; GFX908-NEXT: v_accvgpr_read_b32 v22, a12
-; GFX908-NEXT: v_accvgpr_read_b32 v29, a3
-; GFX908-NEXT: v_accvgpr_read_b32 v28, a2
-; GFX908-NEXT: v_accvgpr_read_b32 v27, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v23, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v24, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v25, a15
; GFX908-NEXT: v_accvgpr_read_b32 v26, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v27, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v28, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v29, a3
; GFX908-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:96
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v5, a7
-; GFX908-NEXT: v_accvgpr_read_b32 v4, a6
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a5
; GFX908-NEXT: v_accvgpr_read_b32 v2, a4
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a7
; GFX908-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:112
; GFX908-NEXT: global_store_dwordx4 v[0:1], v[10:13], off offset:64
; GFX908-NEXT: global_store_dwordx4 v[0:1], v[14:17], off offset:80
@@ -1116,6 +2556,122 @@ define void @test_mfma_f32_32x32x1f32_nonentry_with_agpr(ptr addrspace(1) %arg)
; GFX908-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:16
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90ADAG-LABEL: test_mfma_f32_32x32x1f32_nonentry_with_agpr:
+; GFX90ADAG: ; %bb.0: ; %bb
+; GFX90ADAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90ADAG-NEXT: global_load_dwordx4 a[28:31], v[0:1], off offset:112
+; GFX90ADAG-NEXT: global_load_dwordx4 a[24:27], v[0:1], off offset:96
+; GFX90ADAG-NEXT: global_load_dwordx4 a[20:23], v[0:1], off offset:80
+; GFX90ADAG-NEXT: global_load_dwordx4 a[16:19], v[0:1], off offset:64
+; GFX90ADAG-NEXT: global_load_dwordx4 a[12:15], v[0:1], off offset:48
+; GFX90ADAG-NEXT: global_load_dwordx4 a[8:11], v[0:1], off offset:32
+; GFX90ADAG-NEXT: global_load_dwordx4 a[4:7], v[0:1], off offset:16
+; GFX90ADAG-NEXT: global_load_dwordx4 a[0:3], v[0:1], off
+; GFX90ADAG-NEXT: v_mov_b32_e32 v2, 1.0
+; GFX90ADAG-NEXT: v_mov_b32_e32 v3, 2.0
+; GFX90ADAG-NEXT: s_waitcnt vmcnt(0)
+; GFX90ADAG-NEXT: s_nop 0
+; GFX90ADAG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
+; GFX90ADAG-NEXT: s_nop 15
+; GFX90ADAG-NEXT: s_nop 2
+; GFX90ADAG-NEXT: global_store_dwordx4 v[0:1], a[24:27], off offset:96
+; GFX90ADAG-NEXT: global_store_dwordx4 v[0:1], a[28:31], off offset:112
+; GFX90ADAG-NEXT: global_store_dwordx4 v[0:1], a[16:19], off offset:64
+; GFX90ADAG-NEXT: global_store_dwordx4 v[0:1], a[20:23], off offset:80
+; GFX90ADAG-NEXT: global_store_dwordx4 v[0:1], a[8:11], off offset:32
+; GFX90ADAG-NEXT: global_store_dwordx4 v[0:1], a[12:15], off offset:48
+; GFX90ADAG-NEXT: global_store_dwordx4 v[0:1], a[0:3], off
+; GFX90ADAG-NEXT: global_store_dwordx4 v[0:1], a[4:7], off offset:16
+; GFX90ADAG-NEXT: s_waitcnt vmcnt(0)
+; GFX90ADAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90AGSEL-LABEL: test_mfma_f32_32x32x1f32_nonentry_with_agpr:
+; GFX90AGSEL: ; %bb.0: ; %bb
+; GFX90AGSEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90AGSEL-NEXT: global_load_dwordx4 a[0:3], v[0:1], off
+; GFX90AGSEL-NEXT: global_load_dwordx4 a[4:7], v[0:1], off offset:16
+; GFX90AGSEL-NEXT: global_load_dwordx4 a[8:11], v[0:1], off offset:32
+; GFX90AGSEL-NEXT: global_load_dwordx4 a[12:15], v[0:1], off offset:48
+; GFX90AGSEL-NEXT: global_load_dwordx4 a[16:19], v[0:1], off offset:64
+; GFX90AGSEL-NEXT: global_load_dwordx4 a[20:23], v[0:1], off offset:80
+; GFX90AGSEL-NEXT: global_load_dwordx4 a[24:27], v[0:1], off offset:96
+; GFX90AGSEL-NEXT: global_load_dwordx4 a[28:31], v[0:1], off offset:112
+; GFX90AGSEL-NEXT: v_mov_b32_e32 v2, 1.0
+; GFX90AGSEL-NEXT: v_mov_b32_e32 v3, 2.0
+; GFX90AGSEL-NEXT: s_waitcnt vmcnt(0)
+; GFX90AGSEL-NEXT: s_nop 0
+; GFX90AGSEL-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
+; GFX90AGSEL-NEXT: s_nop 15
+; GFX90AGSEL-NEXT: s_nop 2
+; GFX90AGSEL-NEXT: global_store_dwordx4 v[0:1], a[0:3], off
+; GFX90AGSEL-NEXT: global_store_dwordx4 v[0:1], a[4:7], off offset:16
+; GFX90AGSEL-NEXT: global_store_dwordx4 v[0:1], a[8:11], off offset:32
+; GFX90AGSEL-NEXT: global_store_dwordx4 v[0:1], a[12:15], off offset:48
+; GFX90AGSEL-NEXT: global_store_dwordx4 v[0:1], a[16:19], off offset:64
+; GFX90AGSEL-NEXT: global_store_dwordx4 v[0:1], a[20:23], off offset:80
+; GFX90AGSEL-NEXT: global_store_dwordx4 v[0:1], a[24:27], off offset:96
+; GFX90AGSEL-NEXT: global_store_dwordx4 v[0:1], a[28:31], off offset:112
+; GFX90AGSEL-NEXT: s_waitcnt vmcnt(0)
+; GFX90AGSEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942DAG-LABEL: test_mfma_f32_32x32x1f32_nonentry_with_agpr:
+; GFX942DAG: ; %bb.0: ; %bb
+; GFX942DAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942DAG-NEXT: global_load_dwordx4 a[28:31], v[0:1], off offset:112
+; GFX942DAG-NEXT: global_load_dwordx4 a[24:27], v[0:1], off offset:96
+; GFX942DAG-NEXT: global_load_dwordx4 a[20:23], v[0:1], off offset:80
+; GFX942DAG-NEXT: global_load_dwordx4 a[16:19], v[0:1], off offset:64
+; GFX942DAG-NEXT: global_load_dwordx4 a[12:15], v[0:1], off offset:48
+; GFX942DAG-NEXT: global_load_dwordx4 a[8:11], v[0:1], off offset:32
+; GFX942DAG-NEXT: global_load_dwordx4 a[4:7], v[0:1], off offset:16
+; GFX942DAG-NEXT: global_load_dwordx4 a[0:3], v[0:1], off
+; GFX942DAG-NEXT: v_mov_b32_e32 v2, 1.0
+; GFX942DAG-NEXT: v_mov_b32_e32 v3, 2.0
+; GFX942DAG-NEXT: s_waitcnt vmcnt(0)
+; GFX942DAG-NEXT: s_nop 0
+; GFX942DAG-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v2, v3, a[0:31]
+; GFX942DAG-NEXT: s_nop 15
+; GFX942DAG-NEXT: s_nop 1
+; GFX942DAG-NEXT: global_store_dwordx4 v[0:1], a[24:27], off offset:96
+; GFX942DAG-NEXT: global_store_dwordx4 v[0:1], a[28:31], off offset:112
+; GFX942DAG-NEXT: global_store_dwordx4 v[0:1], a[16:19], off offset:64
+; GFX942DAG-NEXT: global_store_dwordx4 v[0:1], a[20:23], off offset:80
+; GFX942DAG-NEXT: global_store_dwordx4 v[0:1], a[8:11], off offset:32
+; GFX942DAG-NEXT: global_store_dwordx4 v[0:1], a[12:15], off offset:48
+; GFX942DAG-NEXT: global_store_dwordx4 v[0:1], a[0:3], off
+; GFX942DAG-NEXT: global_store_dwordx4 v[0:1], a[4:7], off offset:16
+; GFX942DAG-NEXT: s_waitcnt vmcnt(0)
+; GFX942DAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942GSEL-LABEL: test_mfma_f32_32x32x1f32_nonentry_with_agpr:
+; GFX942GSEL: ; %bb.0: ; %bb
+; GFX942GSEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942GSEL-NEXT: global_load_dwordx4 a[0:3], v[0:1], off
+; GFX942GSEL-NEXT: global_load_dwordx4 a[4:7], v[0:1], off offset:16
+; GFX942GSEL-NEXT: global_load_dwordx4 a[8:11], v[0:1], off offset:32
+; GFX942GSEL-NEXT: global_load_dwordx4 a[12:15], v[0:1], off offset:48
+; GFX942GSEL-NEXT: global_load_dwordx4 a[16:19], v[0:1], off offset:64
+; GFX942GSEL-NEXT: global_load_dwordx4 a[20:23], v[0:1], off offset:80
+; GFX942GSEL-NEXT: global_load_dwordx4 a[24:27], v[0:1], off offset:96
+; GFX942GSEL-NEXT: global_load_dwordx4 a[28:31], v[0:1], off offset:112
+; GFX942GSEL-NEXT: v_mov_b32_e32 v2, 1.0
+; GFX942GSEL-NEXT: v_mov_b32_e32 v3, 2.0
+; GFX942GSEL-NEXT: s_waitcnt vmcnt(0)
+; GFX942GSEL-NEXT: s_nop 0
+; GFX942GSEL-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v2, v3, a[0:31]
+; GFX942GSEL-NEXT: s_nop 15
+; GFX942GSEL-NEXT: s_nop 1
+; GFX942GSEL-NEXT: global_store_dwordx4 v[0:1], a[0:3], off
+; GFX942GSEL-NEXT: global_store_dwordx4 v[0:1], a[4:7], off offset:16
+; GFX942GSEL-NEXT: global_store_dwordx4 v[0:1], a[8:11], off offset:32
+; GFX942GSEL-NEXT: global_store_dwordx4 v[0:1], a[12:15], off offset:48
+; GFX942GSEL-NEXT: global_store_dwordx4 v[0:1], a[16:19], off offset:64
+; GFX942GSEL-NEXT: global_store_dwordx4 v[0:1], a[20:23], off offset:80
+; GFX942GSEL-NEXT: global_store_dwordx4 v[0:1], a[24:27], off offset:96
+; GFX942GSEL-NEXT: global_store_dwordx4 v[0:1], a[28:31], off offset:112
+; GFX942GSEL-NEXT: s_waitcnt vmcnt(0)
+; GFX942GSEL-NEXT: s_setpc_b64 s[30:31]
bb:
%in.1 = load <32 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0)
@@ -1130,5 +2686,5 @@ attributes #1 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="2
attributes #2 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-agpr-alloc"="0" }
attributes #3 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="2" }
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GCN: {{.*}}
; GFX90A: {{.*}}
+; GFX942: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
index 0af655dfbbee9..d444db8cd1bdf 100644
--- a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
@@ -54,49 +54,64 @@ define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 {
; GFX908-NEXT: s_cbranch_scc1 .LBB0_1
; GFX908-NEXT: ; %bb.2: ; %exit
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX908-NEXT: s_nop 13
+; GFX908-NEXT: v_mov_b32_e32 v4, 0
+; GFX908-NEXT: s_nop 12
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a28
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a31
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a20
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a23
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a16
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a19
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a12
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a15
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a8
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a11
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
+; GFX908-NEXT: s_nop 0
; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
-; GFX908-NEXT: v_accvgpr_read_b32 v28, a28
-; GFX908-NEXT: v_accvgpr_read_b32 v29, a29
-; GFX908-NEXT: v_accvgpr_read_b32 v30, a30
-; GFX908-NEXT: v_accvgpr_read_b32 v31, a31
-; GFX908-NEXT: v_mov_b32_e32 v32, 0
; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
-; GFX908-NEXT: v_accvgpr_read_b32 v4, a4
-; GFX908-NEXT: v_accvgpr_read_b32 v5, a5
-; GFX908-NEXT: v_accvgpr_read_b32 v6, a6
-; GFX908-NEXT: v_accvgpr_read_b32 v7, a7
-; GFX908-NEXT: v_accvgpr_read_b32 v8, a8
-; GFX908-NEXT: v_accvgpr_read_b32 v9, a9
-; GFX908-NEXT: v_accvgpr_read_b32 v10, a10
-; GFX908-NEXT: v_accvgpr_read_b32 v11, a11
-; GFX908-NEXT: v_accvgpr_read_b32 v12, a12
-; GFX908-NEXT: v_accvgpr_read_b32 v13, a13
-; GFX908-NEXT: v_accvgpr_read_b32 v14, a14
-; GFX908-NEXT: v_accvgpr_read_b32 v15, a15
-; GFX908-NEXT: v_accvgpr_read_b32 v16, a16
-; GFX908-NEXT: v_accvgpr_read_b32 v17, a17
-; GFX908-NEXT: v_accvgpr_read_b32 v18, a18
-; GFX908-NEXT: v_accvgpr_read_b32 v19, a19
-; GFX908-NEXT: v_accvgpr_read_b32 v20, a20
-; GFX908-NEXT: v_accvgpr_read_b32 v21, a21
-; GFX908-NEXT: v_accvgpr_read_b32 v22, a22
-; GFX908-NEXT: v_accvgpr_read_b32 v23, a23
-; GFX908-NEXT: v_accvgpr_read_b32 v24, a24
-; GFX908-NEXT: v_accvgpr_read_b32 v25, a25
-; GFX908-NEXT: v_accvgpr_read_b32 v26, a26
-; GFX908-NEXT: v_accvgpr_read_b32 v27, a27
-; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
-; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
-; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
-; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
-; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
-; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
-; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
-; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX908-NEXT: s_endpgm
;
; GFX90A-LABEL: test_mfma_loop_zeroinit:
@@ -285,49 +300,64 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(ptr addrspace(1) %arg
; GFX908-NEXT: s_cbranch_scc1 .LBB1_1
; GFX908-NEXT: ; %bb.2: ; %exit
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX908-NEXT: s_nop 13
+; GFX908-NEXT: v_mov_b32_e32 v4, 0
+; GFX908-NEXT: s_nop 12
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a28
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a31
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a20
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a23
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a16
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a19
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a12
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a15
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a8
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a11
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
+; GFX908-NEXT: s_nop 0
; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
-; GFX908-NEXT: v_accvgpr_read_b32 v28, a28
-; GFX908-NEXT: v_accvgpr_read_b32 v29, a29
-; GFX908-NEXT: v_accvgpr_read_b32 v30, a30
-; GFX908-NEXT: v_accvgpr_read_b32 v31, a31
-; GFX908-NEXT: v_mov_b32_e32 v32, 0
; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
-; GFX908-NEXT: v_accvgpr_read_b32 v4, a4
-; GFX908-NEXT: v_accvgpr_read_b32 v5, a5
-; GFX908-NEXT: v_accvgpr_read_b32 v6, a6
-; GFX908-NEXT: v_accvgpr_read_b32 v7, a7
-; GFX908-NEXT: v_accvgpr_read_b32 v8, a8
-; GFX908-NEXT: v_accvgpr_read_b32 v9, a9
-; GFX908-NEXT: v_accvgpr_read_b32 v10, a10
-; GFX908-NEXT: v_accvgpr_read_b32 v11, a11
-; GFX908-NEXT: v_accvgpr_read_b32 v12, a12
-; GFX908-NEXT: v_accvgpr_read_b32 v13, a13
-; GFX908-NEXT: v_accvgpr_read_b32 v14, a14
-; GFX908-NEXT: v_accvgpr_read_b32 v15, a15
-; GFX908-NEXT: v_accvgpr_read_b32 v16, a16
-; GFX908-NEXT: v_accvgpr_read_b32 v17, a17
-; GFX908-NEXT: v_accvgpr_read_b32 v18, a18
-; GFX908-NEXT: v_accvgpr_read_b32 v19, a19
-; GFX908-NEXT: v_accvgpr_read_b32 v20, a20
-; GFX908-NEXT: v_accvgpr_read_b32 v21, a21
-; GFX908-NEXT: v_accvgpr_read_b32 v22, a22
-; GFX908-NEXT: v_accvgpr_read_b32 v23, a23
-; GFX908-NEXT: v_accvgpr_read_b32 v24, a24
-; GFX908-NEXT: v_accvgpr_read_b32 v25, a25
-; GFX908-NEXT: v_accvgpr_read_b32 v26, a26
-; GFX908-NEXT: v_accvgpr_read_b32 v27, a27
-; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
-; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
-; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
-; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
-; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
-; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
-; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
-; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX908-NEXT: s_endpgm
;
; GFX90A-LABEL: test_mfma_loop_unfoldable_splat:
@@ -512,53 +542,69 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 {
; GFX908-NEXT: s_cbranch_scc1 .LBB2_1
; GFX908-NEXT: ; %bb.2: ; %exit
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX908-NEXT: s_nop 13
+; GFX908-NEXT: v_mov_b32_e32 v4, 0
+; GFX908-NEXT: s_nop 12
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a28
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a31
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a20
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a23
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a16
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a19
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a12
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a15
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a8
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a11
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
+; GFX908-NEXT: s_nop 0
; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
-; GFX908-NEXT: v_accvgpr_read_b32 v28, a28
-; GFX908-NEXT: v_accvgpr_read_b32 v29, a29
-; GFX908-NEXT: v_accvgpr_read_b32 v30, a30
-; GFX908-NEXT: v_accvgpr_read_b32 v31, a31
-; GFX908-NEXT: v_mov_b32_e32 v32, 0
; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
-; GFX908-NEXT: v_accvgpr_read_b32 v4, a4
-; GFX908-NEXT: v_accvgpr_read_b32 v5, a5
-; GFX908-NEXT: v_accvgpr_read_b32 v6, a6
-; GFX908-NEXT: v_accvgpr_read_b32 v7, a7
-; GFX908-NEXT: v_accvgpr_read_b32 v8, a8
-; GFX908-NEXT: v_accvgpr_read_b32 v9, a9
-; GFX908-NEXT: v_accvgpr_read_b32 v10, a10
-; GFX908-NEXT: v_accvgpr_read_b32 v11, a11
-; GFX908-NEXT: v_accvgpr_read_b32 v12, a12
-; GFX908-NEXT: v_accvgpr_read_b32 v13, a13
-; GFX908-NEXT: v_accvgpr_read_b32 v14, a14
-; GFX908-NEXT: v_accvgpr_read_b32 v15, a15
-; GFX908-NEXT: v_accvgpr_read_b32 v16, a16
-; GFX908-NEXT: v_accvgpr_read_b32 v17, a17
-; GFX908-NEXT: v_accvgpr_read_b32 v18, a18
-; GFX908-NEXT: v_accvgpr_read_b32 v19, a19
-; GFX908-NEXT: v_accvgpr_read_b32 v20, a20
-; GFX908-NEXT: v_accvgpr_read_b32 v21, a21
-; GFX908-NEXT: v_accvgpr_read_b32 v22, a22
-; GFX908-NEXT: v_accvgpr_read_b32 v23, a23
-; GFX908-NEXT: v_accvgpr_read_b32 v24, a24
-; GFX908-NEXT: v_accvgpr_read_b32 v25, a25
-; GFX908-NEXT: v_accvgpr_read_b32 v26, a26
-; GFX908-NEXT: v_accvgpr_read_b32 v27, a27
-; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
-; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
-; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
-; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
-; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
-; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
-; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
-; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX908-NEXT: s_endpgm
;
; GFX90A-LABEL: test_mfma_loop_non_splat:
; GFX90A: ; %bb.0: ; %entry
+; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0
; GFX90A-NEXT: v_accvgpr_write_b32 a1, 1.0
; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0
; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0
@@ -592,7 +638,6 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 {
; GFX90A-NEXT: v_accvgpr_write_b32 a2, 0
; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0
; GFX90A-NEXT: s_mov_b32 s0, 16
-; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0
; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0
; GFX90A-NEXT: .LBB2_1: ; %for.cond.preheader
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -618,6 +663,7 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 {
;
; GFX942-LABEL: test_mfma_loop_non_splat:
; GFX942: ; %bb.0: ; %entry
+; GFX942-NEXT: v_mov_b32_e32 v0, 1.0
; GFX942-NEXT: v_accvgpr_write_b32 a1, 1.0
; GFX942-NEXT: v_accvgpr_write_b32 a31, 0
; GFX942-NEXT: v_accvgpr_write_b32 a30, 0
@@ -651,7 +697,6 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 {
; GFX942-NEXT: v_accvgpr_write_b32 a2, 0
; GFX942-NEXT: v_accvgpr_write_b32 a0, 0
; GFX942-NEXT: s_mov_b32 s0, 16
-; GFX942-NEXT: v_mov_b32_e32 v0, 1.0
; GFX942-NEXT: v_mov_b32_e32 v1, 2.0
; GFX942-NEXT: .LBB2_1: ; %for.cond.preheader
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -774,49 +819,64 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(ptr addrspace(1) %arg)
; GFX908-NEXT: s_cbranch_scc1 .LBB3_1
; GFX908-NEXT: ; %bb.2: ; %exit
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX908-NEXT: s_nop 13
+; GFX908-NEXT: v_mov_b32_e32 v4, 0
+; GFX908-NEXT: s_nop 12
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a28
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a31
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a20
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a23
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a16
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a19
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a12
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a15
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a8
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a11
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
+; GFX908-NEXT: s_nop 0
; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
-; GFX908-NEXT: v_accvgpr_read_b32 v28, a28
-; GFX908-NEXT: v_accvgpr_read_b32 v29, a29
-; GFX908-NEXT: v_accvgpr_read_b32 v30, a30
-; GFX908-NEXT: v_accvgpr_read_b32 v31, a31
-; GFX908-NEXT: v_mov_b32_e32 v32, 0
; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
-; GFX908-NEXT: v_accvgpr_read_b32 v4, a4
-; GFX908-NEXT: v_accvgpr_read_b32 v5, a5
-; GFX908-NEXT: v_accvgpr_read_b32 v6, a6
-; GFX908-NEXT: v_accvgpr_read_b32 v7, a7
-; GFX908-NEXT: v_accvgpr_read_b32 v8, a8
-; GFX908-NEXT: v_accvgpr_read_b32 v9, a9
-; GFX908-NEXT: v_accvgpr_read_b32 v10, a10
-; GFX908-NEXT: v_accvgpr_read_b32 v11, a11
-; GFX908-NEXT: v_accvgpr_read_b32 v12, a12
-; GFX908-NEXT: v_accvgpr_read_b32 v13, a13
-; GFX908-NEXT: v_accvgpr_read_b32 v14, a14
-; GFX908-NEXT: v_accvgpr_read_b32 v15, a15
-; GFX908-NEXT: v_accvgpr_read_b32 v16, a16
-; GFX908-NEXT: v_accvgpr_read_b32 v17, a17
-; GFX908-NEXT: v_accvgpr_read_b32 v18, a18
-; GFX908-NEXT: v_accvgpr_read_b32 v19, a19
-; GFX908-NEXT: v_accvgpr_read_b32 v20, a20
-; GFX908-NEXT: v_accvgpr_read_b32 v21, a21
-; GFX908-NEXT: v_accvgpr_read_b32 v22, a22
-; GFX908-NEXT: v_accvgpr_read_b32 v23, a23
-; GFX908-NEXT: v_accvgpr_read_b32 v24, a24
-; GFX908-NEXT: v_accvgpr_read_b32 v25, a25
-; GFX908-NEXT: v_accvgpr_read_b32 v26, a26
-; GFX908-NEXT: v_accvgpr_read_b32 v27, a27
-; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
-; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
-; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
-; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
-; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
-; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
-; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
-; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX908-NEXT: s_endpgm
;
; GFX90A-LABEL: test_mfma_loop_unfoldable_seq:
@@ -1019,133 +1079,179 @@ exit:
define amdgpu_kernel void @test_mfma_loop_vgpr_init(ptr addrspace(1) %arg) #0 {
; GFX908-LABEL: test_mfma_loop_vgpr_init:
; GFX908: ; %bb.0: ; %entry
-; GFX908-NEXT: v_accvgpr_write_b32 a31, v0
-; GFX908-NEXT: v_accvgpr_write_b32 a30, v0
-; GFX908-NEXT: v_accvgpr_write_b32 a29, v0
-; GFX908-NEXT: v_accvgpr_write_b32 a28, v0
-; GFX908-NEXT: v_accvgpr_write_b32 a27, v0
-; GFX908-NEXT: v_accvgpr_write_b32 a26, v0
-; GFX908-NEXT: v_accvgpr_write_b32 a25, v0
-; GFX908-NEXT: v_accvgpr_write_b32 a24, v0
-; GFX908-NEXT: v_accvgpr_write_b32 a23, v0
-; GFX908-NEXT: v_accvgpr_write_b32 a22, v0
-; GFX908-NEXT: v_accvgpr_write_b32 a21, v0
-; GFX908-NEXT: v_accvgpr_write_b32 a20, v0
-; GFX908-NEXT: v_accvgpr_write_b32 a19, v0
-; GFX908-NEXT: v_accvgpr_write_b32 a18, v0
-; GFX908-NEXT: v_accvgpr_write_b32 a17, v0
-; GFX908-NEXT: v_accvgpr_write_b32 a16, v0
-; GFX908-NEXT: v_accvgpr_write_b32 a15, v0
-; GFX908-NEXT: v_accvgpr_write_b32 a14, v0
-; GFX908-NEXT: v_accvgpr_write_b32 a13, v0
-; GFX908-NEXT: v_accvgpr_write_b32 a12, v0
-; GFX908-NEXT: v_accvgpr_write_b32 a11, v0
-; GFX908-NEXT: v_accvgpr_write_b32 a10, v0
-; GFX908-NEXT: v_accvgpr_write_b32 a9, v0
-; GFX908-NEXT: v_accvgpr_write_b32 a8, v0
-; GFX908-NEXT: v_accvgpr_write_b32 a7, v0
-; GFX908-NEXT: v_accvgpr_write_b32 a6, v0
-; GFX908-NEXT: v_accvgpr_write_b32 a5, v0
-; GFX908-NEXT: v_accvgpr_write_b32 a4, v0
-; GFX908-NEXT: v_accvgpr_write_b32 a3, v0
-; GFX908-NEXT: v_accvgpr_write_b32 a2, v0
-; GFX908-NEXT: v_accvgpr_write_b32 a1, v0
; GFX908-NEXT: v_accvgpr_write_b32 a0, v0
; GFX908-NEXT: s_mov_b32 s0, 16
; GFX908-NEXT: v_mov_b32_e32 v0, 2.0
; GFX908-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a0
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v2
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX908-NEXT: v_accvgpr_write_b32 a2, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v5
+; GFX908-NEXT: v_accvgpr_write_b32 a4, v2
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX908-NEXT: v_accvgpr_write_b32 a5, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a6, v5
+; GFX908-NEXT: v_accvgpr_write_b32 a7, v2
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX908-NEXT: v_accvgpr_write_b32 a8, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a9, v5
+; GFX908-NEXT: v_accvgpr_write_b32 a10, v2
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX908-NEXT: v_accvgpr_write_b32 a11, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a12, v5
+; GFX908-NEXT: v_accvgpr_write_b32 a13, v2
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX908-NEXT: v_accvgpr_write_b32 a14, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a15, v5
+; GFX908-NEXT: v_accvgpr_write_b32 a16, v2
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX908-NEXT: v_accvgpr_write_b32 a17, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a18, v5
+; GFX908-NEXT: v_accvgpr_write_b32 a19, v2
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX908-NEXT: v_accvgpr_write_b32 a20, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a21, v5
+; GFX908-NEXT: v_accvgpr_write_b32 a22, v2
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX908-NEXT: v_accvgpr_write_b32 a23, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a24, v5
+; GFX908-NEXT: v_accvgpr_write_b32 a25, v2
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX908-NEXT: v_accvgpr_write_b32 a26, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a27, v5
+; GFX908-NEXT: v_accvgpr_write_b32 a28, v2
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX908-NEXT: v_accvgpr_write_b32 a29, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a30, v5
+; GFX908-NEXT: v_accvgpr_write_b32 a31, v2
; GFX908-NEXT: .LBB4_1: ; %for.cond.preheader
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: s_nop 0
; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
; GFX908-NEXT: s_add_i32 s0, s0, -1
; GFX908-NEXT: s_cmp_lg_u32 s0, 0
; GFX908-NEXT: s_cbranch_scc1 .LBB4_1
; GFX908-NEXT: ; %bb.2: ; %exit
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX908-NEXT: s_nop 13
+; GFX908-NEXT: v_mov_b32_e32 v4, 0
+; GFX908-NEXT: s_nop 12
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a28
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a31
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a20
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a23
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a16
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a19
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a12
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a15
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a8
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a11
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
+; GFX908-NEXT: s_nop 0
; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
-; GFX908-NEXT: v_accvgpr_read_b32 v28, a28
-; GFX908-NEXT: v_accvgpr_read_b32 v29, a29
-; GFX908-NEXT: v_accvgpr_read_b32 v30, a30
-; GFX908-NEXT: v_accvgpr_read_b32 v31, a31
-; GFX908-NEXT: v_mov_b32_e32 v32, 0
; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
-; GFX908-NEXT: v_accvgpr_read_b32 v4, a4
-; GFX908-NEXT: v_accvgpr_read_b32 v5, a5
-; GFX908-NEXT: v_accvgpr_read_b32 v6, a6
-; GFX908-NEXT: v_accvgpr_read_b32 v7, a7
-; GFX908-NEXT: v_accvgpr_read_b32 v8, a8
-; GFX908-NEXT: v_accvgpr_read_b32 v9, a9
-; GFX908-NEXT: v_accvgpr_read_b32 v10, a10
-; GFX908-NEXT: v_accvgpr_read_b32 v11, a11
-; GFX908-NEXT: v_accvgpr_read_b32 v12, a12
-; GFX908-NEXT: v_accvgpr_read_b32 v13, a13
-; GFX908-NEXT: v_accvgpr_read_b32 v14, a14
-; GFX908-NEXT: v_accvgpr_read_b32 v15, a15
-; GFX908-NEXT: v_accvgpr_read_b32 v16, a16
-; GFX908-NEXT: v_accvgpr_read_b32 v17, a17
-; GFX908-NEXT: v_accvgpr_read_b32 v18, a18
-; GFX908-NEXT: v_accvgpr_read_b32 v19, a19
-; GFX908-NEXT: v_accvgpr_read_b32 v20, a20
-; GFX908-NEXT: v_accvgpr_read_b32 v21, a21
-; GFX908-NEXT: v_accvgpr_read_b32 v22, a22
-; GFX908-NEXT: v_accvgpr_read_b32 v23, a23
-; GFX908-NEXT: v_accvgpr_read_b32 v24, a24
-; GFX908-NEXT: v_accvgpr_read_b32 v25, a25
-; GFX908-NEXT: v_accvgpr_read_b32 v26, a26
-; GFX908-NEXT: v_accvgpr_read_b32 v27, a27
-; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
-; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
-; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
-; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
-; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
-; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
-; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
-; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX908-NEXT: s_endpgm
;
; GFX90A-LABEL: test_mfma_loop_vgpr_init:
; GFX90A: ; %bb.0: ; %entry
-; GFX90A-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a31, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a30, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a29, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a28, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a27, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a26, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a25, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a24, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a23, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a22, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a21, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a20, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a19, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a18, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a17, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a16, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a15, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a14, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a13, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a12, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a11, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a10, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a9, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a8, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a7, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a6, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a5, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a4, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a3, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_and_b32_e32 v2, 0x3ff, v0
; GFX90A-NEXT: s_mov_b32 s0, 16
; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0
; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a2, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a3, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a4, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a5, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a6, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a7, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a8, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a9, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a10, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a11, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a12, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a13, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a14, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a15, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a16, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a17, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a18, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a19, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a20, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a21, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a22, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a23, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a24, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a25, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a26, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a27, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a28, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a29, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a30, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a31, v2
; GFX90A-NEXT: .LBB4_1: ; %for.cond.preheader
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_nop 1
@@ -1170,42 +1276,42 @@ define amdgpu_kernel void @test_mfma_loop_vgpr_init(ptr addrspace(1) %arg) #0 {
;
; GFX942-LABEL: test_mfma_loop_vgpr_init:
; GFX942: ; %bb.0: ; %entry
-; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a31, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a30, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a29, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a28, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a27, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a26, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a25, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a24, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a23, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a22, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a21, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a20, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a19, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a18, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a17, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a16, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a15, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a14, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a13, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a12, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a11, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a10, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a9, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a8, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a7, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a6, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a5, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a4, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a3, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a2, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a1, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX942-NEXT: v_and_b32_e32 v2, 0x3ff, v0
; GFX942-NEXT: s_mov_b32 s0, 16
; GFX942-NEXT: v_mov_b32_e32 v0, 2.0
; GFX942-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX942-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX942-NEXT: v_accvgpr_write_b32 a1, v2
+; GFX942-NEXT: v_accvgpr_write_b32 a2, v2
+; GFX942-NEXT: v_accvgpr_write_b32 a3, v2
+; GFX942-NEXT: v_accvgpr_write_b32 a4, v2
+; GFX942-NEXT: v_accvgpr_write_b32 a5, v2
+; GFX942-NEXT: v_accvgpr_write_b32 a6, v2
+; GFX942-NEXT: v_accvgpr_write_b32 a7, v2
+; GFX942-NEXT: v_accvgpr_write_b32 a8, v2
+; GFX942-NEXT: v_accvgpr_write_b32 a9, v2
+; GFX942-NEXT: v_accvgpr_write_b32 a10, v2
+; GFX942-NEXT: v_accvgpr_write_b32 a11, v2
+; GFX942-NEXT: v_accvgpr_write_b32 a12, v2
+; GFX942-NEXT: v_accvgpr_write_b32 a13, v2
+; GFX942-NEXT: v_accvgpr_write_b32 a14, v2
+; GFX942-NEXT: v_accvgpr_write_b32 a15, v2
+; GFX942-NEXT: v_accvgpr_write_b32 a16, v2
+; GFX942-NEXT: v_accvgpr_write_b32 a17, v2
+; GFX942-NEXT: v_accvgpr_write_b32 a18, v2
+; GFX942-NEXT: v_accvgpr_write_b32 a19, v2
+; GFX942-NEXT: v_accvgpr_write_b32 a20, v2
+; GFX942-NEXT: v_accvgpr_write_b32 a21, v2
+; GFX942-NEXT: v_accvgpr_write_b32 a22, v2
+; GFX942-NEXT: v_accvgpr_write_b32 a23, v2
+; GFX942-NEXT: v_accvgpr_write_b32 a24, v2
+; GFX942-NEXT: v_accvgpr_write_b32 a25, v2
+; GFX942-NEXT: v_accvgpr_write_b32 a26, v2
+; GFX942-NEXT: v_accvgpr_write_b32 a27, v2
+; GFX942-NEXT: v_accvgpr_write_b32 a28, v2
+; GFX942-NEXT: v_accvgpr_write_b32 a29, v2
+; GFX942-NEXT: v_accvgpr_write_b32 a30, v2
+; GFX942-NEXT: v_accvgpr_write_b32 a31, v2
; GFX942-NEXT: .LBB4_1: ; %for.cond.preheader
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_nop 1
@@ -1329,153 +1435,166 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float
; GFX908-NEXT: s_cbranch_scc1 .LBB5_1
; GFX908-NEXT: ; %bb.2: ; %exit
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX908-NEXT: s_nop 13
+; GFX908-NEXT: v_mov_b32_e32 v4, 0
+; GFX908-NEXT: s_nop 12
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a28
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a31
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a20
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a23
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a16
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a19
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a12
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a15
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a8
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a11
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
+; GFX908-NEXT: s_nop 0
; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
-; GFX908-NEXT: v_accvgpr_read_b32 v28, a28
-; GFX908-NEXT: v_accvgpr_read_b32 v29, a29
-; GFX908-NEXT: v_accvgpr_read_b32 v30, a30
-; GFX908-NEXT: v_accvgpr_read_b32 v31, a31
-; GFX908-NEXT: v_mov_b32_e32 v32, 0
; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
-; GFX908-NEXT: v_accvgpr_read_b32 v4, a4
-; GFX908-NEXT: v_accvgpr_read_b32 v5, a5
-; GFX908-NEXT: v_accvgpr_read_b32 v6, a6
-; GFX908-NEXT: v_accvgpr_read_b32 v7, a7
-; GFX908-NEXT: v_accvgpr_read_b32 v8, a8
-; GFX908-NEXT: v_accvgpr_read_b32 v9, a9
-; GFX908-NEXT: v_accvgpr_read_b32 v10, a10
-; GFX908-NEXT: v_accvgpr_read_b32 v11, a11
-; GFX908-NEXT: v_accvgpr_read_b32 v12, a12
-; GFX908-NEXT: v_accvgpr_read_b32 v13, a13
-; GFX908-NEXT: v_accvgpr_read_b32 v14, a14
-; GFX908-NEXT: v_accvgpr_read_b32 v15, a15
-; GFX908-NEXT: v_accvgpr_read_b32 v16, a16
-; GFX908-NEXT: v_accvgpr_read_b32 v17, a17
-; GFX908-NEXT: v_accvgpr_read_b32 v18, a18
-; GFX908-NEXT: v_accvgpr_read_b32 v19, a19
-; GFX908-NEXT: v_accvgpr_read_b32 v20, a20
-; GFX908-NEXT: v_accvgpr_read_b32 v21, a21
-; GFX908-NEXT: v_accvgpr_read_b32 v22, a22
-; GFX908-NEXT: v_accvgpr_read_b32 v23, a23
-; GFX908-NEXT: v_accvgpr_read_b32 v24, a24
-; GFX908-NEXT: v_accvgpr_read_b32 v25, a25
-; GFX908-NEXT: v_accvgpr_read_b32 v26, a26
-; GFX908-NEXT: v_accvgpr_read_b32 v27, a27
-; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
-; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
-; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
-; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
-; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
-; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
-; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
-; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX908-NEXT: s_endpgm
;
; GFX90A-LABEL: test_mfma_loop_sgpr_init:
; GFX90A: ; %bb.0: ; %entry
; GFX90A-NEXT: s_load_dword s1, s[4:5], 0x2c
; GFX90A-NEXT: s_mov_b32 s0, 16
-; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, s1
-; GFX90A-NEXT: v_accvgpr_write_b32 a31, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a30, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a29, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a28, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a27, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a26, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a25, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a24, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a23, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a22, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a21, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a20, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a19, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a18, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a17, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a16, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a15, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a14, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a13, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a12, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a11, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a10, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a9, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a8, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a7, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a6, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a5, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a4, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a3, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0
-; GFX90A-NEXT: .LBB5_1: ; %for.cond.preheader
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
-; GFX90A-NEXT: s_add_i32 s0, s0, -1
-; GFX90A-NEXT: s_cmp_lg_u32 s0, 0
-; GFX90A-NEXT: s_cbranch_scc1 .LBB5_1
-; GFX90A-NEXT: ; %bb.2: ; %exit
-; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: s_nop 12
-; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
-; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
-; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
-; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
-; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
-; GFX90A-NEXT: s_endpgm
-;
-; GFX942-LABEL: test_mfma_loop_sgpr_init:
-; GFX942: ; %bb.0: ; %entry
-; GFX942-NEXT: s_load_dword s1, s[4:5], 0x2c
-; GFX942-NEXT: s_mov_b32 s0, 16
-; GFX942-NEXT: v_mov_b32_e32 v1, 1.0
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v0, s1
-; GFX942-NEXT: v_accvgpr_write_b32 a31, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a30, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a29, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a28, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a27, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a26, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a25, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a24, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a23, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a22, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a21, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a20, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a19, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a18, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a17, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a16, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a15, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a14, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a13, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a12, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a11, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a10, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a9, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a8, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a7, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a6, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a5, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a4, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a3, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a2, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a1, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, s1
+; GFX90A-NEXT: v_accvgpr_mov_b32 a1, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a4, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a5, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a6, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a7, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a8, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a9, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a10, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a11, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a12, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a13, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a14, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a15, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a16, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a17, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a18, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a19, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a20, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a21, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a22, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a23, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a24, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a25, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a26, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a27, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a28, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a29, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a30, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a31, a0
+; GFX90A-NEXT: .LBB5_1: ; %for.cond.preheader
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
+; GFX90A-NEXT: s_add_i32 s0, s0, -1
+; GFX90A-NEXT: s_cmp_lg_u32 s0, 0
+; GFX90A-NEXT: s_cbranch_scc1 .LBB5_1
+; GFX90A-NEXT: ; %bb.2: ; %exit
+; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_nop 12
+; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
+; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
+; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
+; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
+; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: test_mfma_loop_sgpr_init:
+; GFX942: ; %bb.0: ; %entry
+; GFX942-NEXT: s_load_dword s1, s[4:5], 0x2c
+; GFX942-NEXT: s_mov_b32 s0, 16
; GFX942-NEXT: v_mov_b32_e32 v0, 2.0
+; GFX942-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_accvgpr_write_b32 a0, s1
+; GFX942-NEXT: v_accvgpr_mov_b32 a1, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a3, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a4, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a5, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a6, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a7, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a8, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a9, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a10, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a11, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a12, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a13, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a14, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a15, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a16, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a17, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a18, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a19, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a20, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a21, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a22, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a23, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a24, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a25, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a26, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a27, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a28, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a29, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a30, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a31, a0
; GFX942-NEXT: .LBB5_1: ; %for.cond.preheader
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_nop 1
@@ -1596,60 +1715,72 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa
; GFX908-NEXT: s_cbranch_scc1 .LBB6_1
; GFX908-NEXT: ; %bb.2: ; %exit
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX908-NEXT: s_nop 13
+; GFX908-NEXT: v_mov_b32_e32 v4, 0
+; GFX908-NEXT: s_nop 12
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a28
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a31
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a20
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a23
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a16
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a19
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a12
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a15
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a8
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a11
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
+; GFX908-NEXT: s_nop 0
; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
-; GFX908-NEXT: v_accvgpr_read_b32 v28, a28
-; GFX908-NEXT: v_accvgpr_read_b32 v29, a29
-; GFX908-NEXT: v_accvgpr_read_b32 v30, a30
-; GFX908-NEXT: v_accvgpr_read_b32 v31, a31
-; GFX908-NEXT: v_mov_b32_e32 v32, 0
; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
-; GFX908-NEXT: v_accvgpr_read_b32 v4, a4
-; GFX908-NEXT: v_accvgpr_read_b32 v5, a5
-; GFX908-NEXT: v_accvgpr_read_b32 v6, a6
-; GFX908-NEXT: v_accvgpr_read_b32 v7, a7
-; GFX908-NEXT: v_accvgpr_read_b32 v8, a8
-; GFX908-NEXT: v_accvgpr_read_b32 v9, a9
-; GFX908-NEXT: v_accvgpr_read_b32 v10, a10
-; GFX908-NEXT: v_accvgpr_read_b32 v11, a11
-; GFX908-NEXT: v_accvgpr_read_b32 v12, a12
-; GFX908-NEXT: v_accvgpr_read_b32 v13, a13
-; GFX908-NEXT: v_accvgpr_read_b32 v14, a14
-; GFX908-NEXT: v_accvgpr_read_b32 v15, a15
-; GFX908-NEXT: v_accvgpr_read_b32 v16, a16
-; GFX908-NEXT: v_accvgpr_read_b32 v17, a17
-; GFX908-NEXT: v_accvgpr_read_b32 v18, a18
-; GFX908-NEXT: v_accvgpr_read_b32 v19, a19
-; GFX908-NEXT: v_accvgpr_read_b32 v20, a20
-; GFX908-NEXT: v_accvgpr_read_b32 v21, a21
-; GFX908-NEXT: v_accvgpr_read_b32 v22, a22
-; GFX908-NEXT: v_accvgpr_read_b32 v23, a23
-; GFX908-NEXT: v_accvgpr_read_b32 v24, a24
-; GFX908-NEXT: v_accvgpr_read_b32 v25, a25
-; GFX908-NEXT: v_accvgpr_read_b32 v26, a26
-; GFX908-NEXT: v_accvgpr_read_b32 v27, a27
-; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
-; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
-; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
-; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
-; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
-; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
-; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
-; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX908-NEXT: s_endpgm
;
; GFX90A-LABEL: test_mfma_loop_mixed_init:
; GFX90A: ; %bb.0: ; %entry
; GFX90A-NEXT: s_load_dword s1, s[4:5], 0x2c
-; GFX90A-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_and_b32_e32 v2, 0x3ff, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0
; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, s1
; GFX90A-NEXT: v_accvgpr_write_b32 a29, 0
; GFX90A-NEXT: v_accvgpr_write_b32 a28, 0
; GFX90A-NEXT: v_accvgpr_write_b32 a27, 0
@@ -1679,9 +1810,11 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa
; GFX90A-NEXT: v_accvgpr_write_b32 a3, 0
; GFX90A-NEXT: v_accvgpr_write_b32 a2, 0
; GFX90A-NEXT: s_mov_b32 s0, 16
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1
; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0
; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: .LBB6_1: ; %for.cond.preheader
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_nop 1
@@ -1707,12 +1840,9 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa
; GFX942-LABEL: test_mfma_loop_mixed_init:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dword s1, s[4:5], 0x2c
-; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX942-NEXT: v_and_b32_e32 v2, 0x3ff, v0
; GFX942-NEXT: v_accvgpr_write_b32 a31, 0
; GFX942-NEXT: v_accvgpr_write_b32 a30, 0
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v0, s1
; GFX942-NEXT: v_accvgpr_write_b32 a29, 0
; GFX942-NEXT: v_accvgpr_write_b32 a28, 0
; GFX942-NEXT: v_accvgpr_write_b32 a27, 0
@@ -1742,9 +1872,11 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa
; GFX942-NEXT: v_accvgpr_write_b32 a3, 0
; GFX942-NEXT: v_accvgpr_write_b32 a2, 0
; GFX942-NEXT: s_mov_b32 s0, 16
-; GFX942-NEXT: v_accvgpr_write_b32 a1, v0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_accvgpr_write_b32 a1, s1
; GFX942-NEXT: v_mov_b32_e32 v0, 2.0
; GFX942-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX942-NEXT: v_accvgpr_write_b32 a0, v2
; GFX942-NEXT: .LBB6_1: ; %for.cond.preheader
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_nop 1
@@ -1835,49 +1967,64 @@ define amdgpu_kernel void @test_mfma_loop_mfma_forward_init(ptr addrspace(1) %ar
; GFX908-NEXT: s_cbranch_scc1 .LBB7_1
; GFX908-NEXT: ; %bb.2: ; %exit
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX908-NEXT: s_nop 13
+; GFX908-NEXT: v_mov_b32_e32 v4, 0
+; GFX908-NEXT: s_nop 12
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a28
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a31
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a20
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a23
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a16
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a19
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a12
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a15
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a8
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a11
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
+; GFX908-NEXT: s_nop 0
; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
-; GFX908-NEXT: v_accvgpr_read_b32 v28, a28
-; GFX908-NEXT: v_accvgpr_read_b32 v29, a29
-; GFX908-NEXT: v_accvgpr_read_b32 v30, a30
-; GFX908-NEXT: v_accvgpr_read_b32 v31, a31
-; GFX908-NEXT: v_mov_b32_e32 v32, 0
; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
-; GFX908-NEXT: v_accvgpr_read_b32 v4, a4
-; GFX908-NEXT: v_accvgpr_read_b32 v5, a5
-; GFX908-NEXT: v_accvgpr_read_b32 v6, a6
-; GFX908-NEXT: v_accvgpr_read_b32 v7, a7
-; GFX908-NEXT: v_accvgpr_read_b32 v8, a8
-; GFX908-NEXT: v_accvgpr_read_b32 v9, a9
-; GFX908-NEXT: v_accvgpr_read_b32 v10, a10
-; GFX908-NEXT: v_accvgpr_read_b32 v11, a11
-; GFX908-NEXT: v_accvgpr_read_b32 v12, a12
-; GFX908-NEXT: v_accvgpr_read_b32 v13, a13
-; GFX908-NEXT: v_accvgpr_read_b32 v14, a14
-; GFX908-NEXT: v_accvgpr_read_b32 v15, a15
-; GFX908-NEXT: v_accvgpr_read_b32 v16, a16
-; GFX908-NEXT: v_accvgpr_read_b32 v17, a17
-; GFX908-NEXT: v_accvgpr_read_b32 v18, a18
-; GFX908-NEXT: v_accvgpr_read_b32 v19, a19
-; GFX908-NEXT: v_accvgpr_read_b32 v20, a20
-; GFX908-NEXT: v_accvgpr_read_b32 v21, a21
-; GFX908-NEXT: v_accvgpr_read_b32 v22, a22
-; GFX908-NEXT: v_accvgpr_read_b32 v23, a23
-; GFX908-NEXT: v_accvgpr_read_b32 v24, a24
-; GFX908-NEXT: v_accvgpr_read_b32 v25, a25
-; GFX908-NEXT: v_accvgpr_read_b32 v26, a26
-; GFX908-NEXT: v_accvgpr_read_b32 v27, a27
-; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
-; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
-; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
-; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
-; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
-; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
-; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
-; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX908-NEXT: s_endpgm
;
; GFX90A-LABEL: test_mfma_loop_mfma_forward_init:
@@ -2040,49 +2187,64 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 {
; GFX908-NEXT: s_cbranch_scc1 .LBB8_1
; GFX908-NEXT: ; %bb.2: ; %exit
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX908-NEXT: s_nop 13
+; GFX908-NEXT: v_mov_b32_e32 v4, 0
+; GFX908-NEXT: s_nop 12
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a28
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a31
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a20
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a23
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a16
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a19
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a12
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a15
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a8
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a11
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
+; GFX908-NEXT: s_nop 0
; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
-; GFX908-NEXT: v_accvgpr_read_b32 v28, a28
-; GFX908-NEXT: v_accvgpr_read_b32 v29, a29
-; GFX908-NEXT: v_accvgpr_read_b32 v30, a30
-; GFX908-NEXT: v_accvgpr_read_b32 v31, a31
-; GFX908-NEXT: v_mov_b32_e32 v32, 0
; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
-; GFX908-NEXT: v_accvgpr_read_b32 v4, a4
-; GFX908-NEXT: v_accvgpr_read_b32 v5, a5
-; GFX908-NEXT: v_accvgpr_read_b32 v6, a6
-; GFX908-NEXT: v_accvgpr_read_b32 v7, a7
-; GFX908-NEXT: v_accvgpr_read_b32 v8, a8
-; GFX908-NEXT: v_accvgpr_read_b32 v9, a9
-; GFX908-NEXT: v_accvgpr_read_b32 v10, a10
-; GFX908-NEXT: v_accvgpr_read_b32 v11, a11
-; GFX908-NEXT: v_accvgpr_read_b32 v12, a12
-; GFX908-NEXT: v_accvgpr_read_b32 v13, a13
-; GFX908-NEXT: v_accvgpr_read_b32 v14, a14
-; GFX908-NEXT: v_accvgpr_read_b32 v15, a15
-; GFX908-NEXT: v_accvgpr_read_b32 v16, a16
-; GFX908-NEXT: v_accvgpr_read_b32 v17, a17
-; GFX908-NEXT: v_accvgpr_read_b32 v18, a18
-; GFX908-NEXT: v_accvgpr_read_b32 v19, a19
-; GFX908-NEXT: v_accvgpr_read_b32 v20, a20
-; GFX908-NEXT: v_accvgpr_read_b32 v21, a21
-; GFX908-NEXT: v_accvgpr_read_b32 v22, a22
-; GFX908-NEXT: v_accvgpr_read_b32 v23, a23
-; GFX908-NEXT: v_accvgpr_read_b32 v24, a24
-; GFX908-NEXT: v_accvgpr_read_b32 v25, a25
-; GFX908-NEXT: v_accvgpr_read_b32 v26, a26
-; GFX908-NEXT: v_accvgpr_read_b32 v27, a27
-; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
-; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
-; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
-; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
-; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
-; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
-; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
-; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX908-NEXT: s_endpgm
;
; GFX90A-LABEL: test_mfma_loop_agpr_init:
@@ -2319,49 +2481,64 @@ define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(ptr addrspace(1) %arg)
; GFX908-NEXT: s_cbranch_scc1 .LBB9_1
; GFX908-NEXT: ; %bb.4: ; %exit
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX908-NEXT: s_nop 10
+; GFX908-NEXT: v_mov_b32_e32 v4, 0
+; GFX908-NEXT: s_nop 9
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a28
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a31
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a20
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a23
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a16
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a19
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a12
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a15
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a8
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a11
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
+; GFX908-NEXT: s_nop 0
; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
-; GFX908-NEXT: v_accvgpr_read_b32 v28, a28
-; GFX908-NEXT: v_accvgpr_read_b32 v29, a29
-; GFX908-NEXT: v_accvgpr_read_b32 v30, a30
-; GFX908-NEXT: v_accvgpr_read_b32 v31, a31
-; GFX908-NEXT: v_mov_b32_e32 v32, 0
; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
-; GFX908-NEXT: v_accvgpr_read_b32 v4, a4
-; GFX908-NEXT: v_accvgpr_read_b32 v5, a5
-; GFX908-NEXT: v_accvgpr_read_b32 v6, a6
-; GFX908-NEXT: v_accvgpr_read_b32 v7, a7
-; GFX908-NEXT: v_accvgpr_read_b32 v8, a8
-; GFX908-NEXT: v_accvgpr_read_b32 v9, a9
-; GFX908-NEXT: v_accvgpr_read_b32 v10, a10
-; GFX908-NEXT: v_accvgpr_read_b32 v11, a11
-; GFX908-NEXT: v_accvgpr_read_b32 v12, a12
-; GFX908-NEXT: v_accvgpr_read_b32 v13, a13
-; GFX908-NEXT: v_accvgpr_read_b32 v14, a14
-; GFX908-NEXT: v_accvgpr_read_b32 v15, a15
-; GFX908-NEXT: v_accvgpr_read_b32 v16, a16
-; GFX908-NEXT: v_accvgpr_read_b32 v17, a17
-; GFX908-NEXT: v_accvgpr_read_b32 v18, a18
-; GFX908-NEXT: v_accvgpr_read_b32 v19, a19
-; GFX908-NEXT: v_accvgpr_read_b32 v20, a20
-; GFX908-NEXT: v_accvgpr_read_b32 v21, a21
-; GFX908-NEXT: v_accvgpr_read_b32 v22, a22
-; GFX908-NEXT: v_accvgpr_read_b32 v23, a23
-; GFX908-NEXT: v_accvgpr_read_b32 v24, a24
-; GFX908-NEXT: v_accvgpr_read_b32 v25, a25
-; GFX908-NEXT: v_accvgpr_read_b32 v26, a26
-; GFX908-NEXT: v_accvgpr_read_b32 v27, a27
-; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
-; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
-; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
-; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
-; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
-; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
-; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
-; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX908-NEXT: s_endpgm
;
; GFX90A-LABEL: test_mfma_nested_loop_zeroinit:
@@ -2822,8 +2999,8 @@ define <32 x float> @test_mfma_loop_non_splat_ret_use() #0 {
; GFX908-NEXT: v_accvgpr_write_b32 a3, 0
; GFX908-NEXT: v_accvgpr_write_b32 a2, 0
; GFX908-NEXT: v_accvgpr_write_b32 a0, 0
-; GFX908-NEXT: s_mov_b32 s4, 16
; GFX908-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX908-NEXT: s_mov_b32 s4, 16
; GFX908-NEXT: v_mov_b32_e32 v1, 2.0
; GFX908-NEXT: .LBB11_1: ; %for.cond.preheader
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -2871,6 +3048,7 @@ define <32 x float> @test_mfma_loop_non_splat_ret_use() #0 {
; GFX90A-LABEL: test_mfma_loop_non_splat_ret_use:
; GFX90A: ; %bb.0: ; %entry
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0
; GFX90A-NEXT: v_accvgpr_write_b32 a1, 1.0
; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0
; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0
@@ -2904,7 +3082,6 @@ define <32 x float> @test_mfma_loop_non_splat_ret_use() #0 {
; GFX90A-NEXT: v_accvgpr_write_b32 a2, 0
; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0
; GFX90A-NEXT: s_mov_b32 s4, 16
-; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0
; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0
; GFX90A-NEXT: .LBB11_1: ; %for.cond.preheader
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -2952,6 +3129,7 @@ define <32 x float> @test_mfma_loop_non_splat_ret_use() #0 {
; GFX942-LABEL: test_mfma_loop_non_splat_ret_use:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v0, 1.0
; GFX942-NEXT: v_accvgpr_write_b32 a1, 1.0
; GFX942-NEXT: v_accvgpr_write_b32 a31, 0
; GFX942-NEXT: v_accvgpr_write_b32 a30, 0
@@ -2985,7 +3163,6 @@ define <32 x float> @test_mfma_loop_non_splat_ret_use() #0 {
; GFX942-NEXT: v_accvgpr_write_b32 a2, 0
; GFX942-NEXT: v_accvgpr_write_b32 a0, 0
; GFX942-NEXT: s_mov_b32 s0, 16
-; GFX942-NEXT: v_mov_b32_e32 v0, 1.0
; GFX942-NEXT: v_mov_b32_e32 v1, 2.0
; GFX942-NEXT: .LBB11_1: ; %for.cond.preheader
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
diff --git a/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll b/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll
index 51cd564bdece3..800eb9efa571e 100644
--- a/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll
+++ b/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll
@@ -95,123 +95,123 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 {
; GREEDY908-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v3, v0, a[0:31]
; GREEDY908-NEXT: s_nop 15
; GREEDY908-NEXT: s_nop 1
-; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a32
-; GREEDY908-NEXT: v_accvgpr_read_b32 v5, a61
-; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a60
-; GREEDY908-NEXT: v_accvgpr_write_b32 a2, v1
-; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a33
-; GREEDY908-NEXT: v_accvgpr_read_b32 v7, a59
-; GREEDY908-NEXT: v_accvgpr_read_b32 v8, a58
-; GREEDY908-NEXT: v_accvgpr_write_b32 a3, v1
+; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a32
+; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a33
; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a34
-; GREEDY908-NEXT: v_accvgpr_read_b32 v9, a57
-; GREEDY908-NEXT: v_accvgpr_read_b32 v10, a56
+; GREEDY908-NEXT: v_accvgpr_write_b32 a2, v2
+; GREEDY908-NEXT: v_accvgpr_write_b32 a3, v6
; GREEDY908-NEXT: v_accvgpr_write_b32 a4, v1
-; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a35
-; GREEDY908-NEXT: v_accvgpr_read_b32 v11, a55
-; GREEDY908-NEXT: v_accvgpr_read_b32 v12, a54
-; GREEDY908-NEXT: v_accvgpr_write_b32 a5, v1
-; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a36
-; GREEDY908-NEXT: v_accvgpr_read_b32 v13, a53
-; GREEDY908-NEXT: v_accvgpr_read_b32 v14, a52
-; GREEDY908-NEXT: v_accvgpr_write_b32 a6, v1
+; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a35
+; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a36
; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a37
-; GREEDY908-NEXT: v_accvgpr_read_b32 v15, a51
-; GREEDY908-NEXT: v_accvgpr_read_b32 v16, a50
+; GREEDY908-NEXT: v_accvgpr_write_b32 a5, v2
+; GREEDY908-NEXT: v_accvgpr_write_b32 a6, v6
; GREEDY908-NEXT: v_accvgpr_write_b32 a7, v1
-; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a38
-; GREEDY908-NEXT: v_accvgpr_read_b32 v17, a49
-; GREEDY908-NEXT: v_accvgpr_read_b32 v18, a48
-; GREEDY908-NEXT: v_accvgpr_write_b32 a8, v1
-; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a39
-; GREEDY908-NEXT: v_accvgpr_read_b32 v19, a47
-; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a46
-; GREEDY908-NEXT: v_accvgpr_write_b32 a9, v1
+; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a38
+; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a39
; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a40
-; GREEDY908-NEXT: v_accvgpr_write_b32 a16, v2
-; GREEDY908-NEXT: v_accvgpr_write_b32 a17, v19
+; GREEDY908-NEXT: v_accvgpr_write_b32 a8, v2
+; GREEDY908-NEXT: v_accvgpr_write_b32 a9, v6
; GREEDY908-NEXT: v_accvgpr_write_b32 a10, v1
-; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a41
-; GREEDY908-NEXT: v_accvgpr_write_b32 a18, v18
-; GREEDY908-NEXT: v_accvgpr_write_b32 a19, v17
-; GREEDY908-NEXT: v_accvgpr_write_b32 a11, v1
-; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a42
-; GREEDY908-NEXT: v_accvgpr_write_b32 a20, v16
-; GREEDY908-NEXT: v_accvgpr_write_b32 a21, v15
-; GREEDY908-NEXT: v_accvgpr_write_b32 a12, v1
+; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a41
+; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a42
; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a43
-; GREEDY908-NEXT: v_accvgpr_write_b32 a22, v14
-; GREEDY908-NEXT: v_accvgpr_write_b32 a23, v13
+; GREEDY908-NEXT: v_accvgpr_write_b32 a11, v2
+; GREEDY908-NEXT: v_accvgpr_write_b32 a12, v6
; GREEDY908-NEXT: v_accvgpr_write_b32 a13, v1
-; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a44
-; GREEDY908-NEXT: v_accvgpr_write_b32 a24, v12
-; GREEDY908-NEXT: v_accvgpr_write_b32 a25, v11
-; GREEDY908-NEXT: v_accvgpr_write_b32 a14, v1
-; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a45
-; GREEDY908-NEXT: v_accvgpr_write_b32 a26, v10
-; GREEDY908-NEXT: v_accvgpr_write_b32 a27, v9
-; GREEDY908-NEXT: v_accvgpr_write_b32 a15, v1
-; GREEDY908-NEXT: v_accvgpr_write_b32 a28, v8
-; GREEDY908-NEXT: v_accvgpr_write_b32 a29, v7
+; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a44
+; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a45
+; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a46
+; GREEDY908-NEXT: v_accvgpr_write_b32 a14, v2
+; GREEDY908-NEXT: v_accvgpr_write_b32 a15, v6
+; GREEDY908-NEXT: v_accvgpr_write_b32 a16, v1
+; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a47
+; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a48
+; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a49
+; GREEDY908-NEXT: v_accvgpr_write_b32 a17, v2
+; GREEDY908-NEXT: v_accvgpr_write_b32 a18, v6
+; GREEDY908-NEXT: v_accvgpr_write_b32 a19, v1
+; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a50
+; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a51
+; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a52
+; GREEDY908-NEXT: v_accvgpr_write_b32 a20, v2
+; GREEDY908-NEXT: v_accvgpr_write_b32 a21, v6
+; GREEDY908-NEXT: v_accvgpr_write_b32 a22, v1
+; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a53
+; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a54
+; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a55
+; GREEDY908-NEXT: v_accvgpr_write_b32 a23, v2
+; GREEDY908-NEXT: v_accvgpr_write_b32 a24, v6
+; GREEDY908-NEXT: v_accvgpr_write_b32 a25, v1
+; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a56
+; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a57
+; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a58
+; GREEDY908-NEXT: v_accvgpr_write_b32 a26, v2
+; GREEDY908-NEXT: v_accvgpr_write_b32 a27, v6
+; GREEDY908-NEXT: v_accvgpr_write_b32 a28, v1
+; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a59
+; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a60
+; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a61
+; GREEDY908-NEXT: v_accvgpr_write_b32 a29, v2
; GREEDY908-NEXT: v_accvgpr_write_b32 a30, v6
-; GREEDY908-NEXT: v_accvgpr_write_b32 a31, v5
+; GREEDY908-NEXT: v_accvgpr_write_b32 a31, v1
; GREEDY908-NEXT: s_nop 0
; GREEDY908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31]
; GREEDY908-NEXT: s_nop 15
; GREEDY908-NEXT: s_nop 1
-; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a27
-; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a26
-; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a25
; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a24
+; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a25
+; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a26
+; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a27
; GREEDY908-NEXT: s_nop 1
; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96
; GREEDY908-NEXT: s_nop 0
-; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a31
-; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a30
-; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a29
; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a28
+; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a29
+; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a30
+; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a31
; GREEDY908-NEXT: s_nop 1
; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:112
; GREEDY908-NEXT: s_nop 0
-; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a19
-; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a18
-; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a17
; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a16
+; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a17
+; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a18
+; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a19
; GREEDY908-NEXT: s_nop 1
; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64
; GREEDY908-NEXT: s_nop 0
-; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a23
-; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a22
-; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a21
; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a20
+; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a21
+; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a22
+; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a23
; GREEDY908-NEXT: s_nop 1
; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80
; GREEDY908-NEXT: s_nop 0
-; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a11
-; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a10
-; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a9
; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a8
+; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a9
+; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a10
+; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a11
; GREEDY908-NEXT: s_nop 1
; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32
; GREEDY908-NEXT: s_nop 0
-; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a15
-; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a14
-; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a13
; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a12
+; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a13
+; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a14
+; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a15
; GREEDY908-NEXT: s_nop 1
; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48
; GREEDY908-NEXT: s_nop 0
-; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a3
-; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a2
-; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a1
; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a0
+; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a1
+; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a2
+; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a3
; GREEDY908-NEXT: s_nop 1
; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35]
; GREEDY908-NEXT: s_nop 0
-; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a7
-; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a6
-; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a5
; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a4
+; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a5
+; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a6
+; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a7
; GREEDY908-NEXT: s_nop 1
; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16
; GREEDY908-NEXT: s_endpgm
@@ -499,105 +499,73 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 {
; FAST90A-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0x0
; FAST90A-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x40
; FAST90A-NEXT: s_waitcnt lgkmcnt(0)
-; FAST90A-NEXT: v_accvgpr_write_b32 a32, s36
-; FAST90A-NEXT: v_accvgpr_write_b32 a33, s37
-; FAST90A-NEXT: v_accvgpr_write_b32 a34, s38
-; FAST90A-NEXT: v_accvgpr_write_b32 a35, s39
-; FAST90A-NEXT: v_accvgpr_write_b32 a36, s40
-; FAST90A-NEXT: v_accvgpr_write_b32 a37, s41
-; FAST90A-NEXT: v_accvgpr_write_b32 a38, s42
-; FAST90A-NEXT: v_accvgpr_write_b32 a39, s43
-; FAST90A-NEXT: v_accvgpr_write_b32 a40, s44
-; FAST90A-NEXT: v_accvgpr_write_b32 a41, s45
-; FAST90A-NEXT: v_accvgpr_write_b32 a42, s46
-; FAST90A-NEXT: v_accvgpr_write_b32 a43, s47
-; FAST90A-NEXT: v_accvgpr_write_b32 a44, s48
-; FAST90A-NEXT: v_accvgpr_write_b32 a45, s49
-; FAST90A-NEXT: v_accvgpr_write_b32 a46, s50
-; FAST90A-NEXT: v_accvgpr_write_b32 a47, s51
-; FAST90A-NEXT: v_accvgpr_write_b32 a48, s4
-; FAST90A-NEXT: v_accvgpr_write_b32 a49, s5
-; FAST90A-NEXT: v_accvgpr_write_b32 a50, s6
-; FAST90A-NEXT: v_accvgpr_write_b32 a51, s7
-; FAST90A-NEXT: v_accvgpr_write_b32 a52, s8
-; FAST90A-NEXT: v_accvgpr_write_b32 a53, s9
-; FAST90A-NEXT: v_accvgpr_write_b32 a54, s10
-; FAST90A-NEXT: v_accvgpr_write_b32 a55, s11
-; FAST90A-NEXT: v_accvgpr_write_b32 a56, s12
-; FAST90A-NEXT: v_accvgpr_write_b32 a57, s13
-; FAST90A-NEXT: v_accvgpr_write_b32 a58, s14
-; FAST90A-NEXT: v_accvgpr_write_b32 a59, s15
-; FAST90A-NEXT: v_accvgpr_write_b32 a60, s16
-; FAST90A-NEXT: v_accvgpr_write_b32 a61, s17
-; FAST90A-NEXT: v_accvgpr_write_b32 a62, s18
-; FAST90A-NEXT: v_accvgpr_write_b32 a63, s19
+; FAST90A-NEXT: v_accvgpr_write_b32 a0, s36
+; FAST90A-NEXT: v_accvgpr_write_b32 a1, s37
+; FAST90A-NEXT: v_accvgpr_write_b32 a2, s38
+; FAST90A-NEXT: v_accvgpr_write_b32 a3, s39
+; FAST90A-NEXT: v_accvgpr_write_b32 a4, s40
+; FAST90A-NEXT: v_accvgpr_write_b32 a5, s41
+; FAST90A-NEXT: v_accvgpr_write_b32 a6, s42
+; FAST90A-NEXT: v_accvgpr_write_b32 a7, s43
+; FAST90A-NEXT: v_accvgpr_write_b32 a8, s44
+; FAST90A-NEXT: v_accvgpr_write_b32 a9, s45
+; FAST90A-NEXT: v_accvgpr_write_b32 a10, s46
+; FAST90A-NEXT: v_accvgpr_write_b32 a11, s47
+; FAST90A-NEXT: v_accvgpr_write_b32 a12, s48
+; FAST90A-NEXT: v_accvgpr_write_b32 a13, s49
+; FAST90A-NEXT: v_accvgpr_write_b32 a14, s50
+; FAST90A-NEXT: v_accvgpr_write_b32 a15, s51
+; FAST90A-NEXT: v_accvgpr_write_b32 a16, s4
+; FAST90A-NEXT: v_accvgpr_write_b32 a17, s5
+; FAST90A-NEXT: v_accvgpr_write_b32 a18, s6
+; FAST90A-NEXT: v_accvgpr_write_b32 a19, s7
+; FAST90A-NEXT: v_accvgpr_write_b32 a20, s8
+; FAST90A-NEXT: v_accvgpr_write_b32 a21, s9
+; FAST90A-NEXT: v_accvgpr_write_b32 a22, s10
+; FAST90A-NEXT: v_accvgpr_write_b32 a23, s11
+; FAST90A-NEXT: v_accvgpr_write_b32 a24, s12
+; FAST90A-NEXT: v_accvgpr_write_b32 a25, s13
+; FAST90A-NEXT: v_accvgpr_write_b32 a26, s14
+; FAST90A-NEXT: v_accvgpr_write_b32 a27, s15
+; FAST90A-NEXT: v_accvgpr_write_b32 a28, s16
+; FAST90A-NEXT: v_accvgpr_write_b32 a29, s17
+; FAST90A-NEXT: v_accvgpr_write_b32 a30, s18
+; FAST90A-NEXT: v_accvgpr_write_b32 a31, s19
; FAST90A-NEXT: s_nop 1
-; FAST90A-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v1, v2, a[32:63]
-; FAST90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[32:63]
+; FAST90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
+; FAST90A-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v1, v2, a[0:31]
; FAST90A-NEXT: s_nop 15
; FAST90A-NEXT: s_nop 2
-; FAST90A-NEXT: v_accvgpr_read_b32 v3, a29
-; FAST90A-NEXT: v_accvgpr_read_b32 v4, a28
-; FAST90A-NEXT: v_accvgpr_read_b32 v5, a27
-; FAST90A-NEXT: v_accvgpr_read_b32 v6, a26
-; FAST90A-NEXT: v_accvgpr_read_b32 v7, a25
-; FAST90A-NEXT: v_accvgpr_read_b32 v8, a24
-; FAST90A-NEXT: v_accvgpr_read_b32 v9, a23
-; FAST90A-NEXT: v_accvgpr_read_b32 v10, a22
-; FAST90A-NEXT: v_accvgpr_read_b32 v11, a21
-; FAST90A-NEXT: v_accvgpr_read_b32 v12, a20
-; FAST90A-NEXT: v_accvgpr_read_b32 v13, a19
-; FAST90A-NEXT: v_accvgpr_read_b32 v14, a18
-; FAST90A-NEXT: v_accvgpr_read_b32 v15, a17
-; FAST90A-NEXT: v_accvgpr_read_b32 v16, a16
-; FAST90A-NEXT: v_accvgpr_read_b32 v17, a15
-; FAST90A-NEXT: v_accvgpr_read_b32 v18, a14
-; FAST90A-NEXT: v_accvgpr_read_b32 v19, a13
-; FAST90A-NEXT: v_accvgpr_read_b32 v20, a12
-; FAST90A-NEXT: v_accvgpr_read_b32 v21, a11
-; FAST90A-NEXT: v_accvgpr_read_b32 v22, a10
-; FAST90A-NEXT: v_accvgpr_read_b32 v23, a9
-; FAST90A-NEXT: v_accvgpr_read_b32 v24, a8
-; FAST90A-NEXT: v_accvgpr_read_b32 v25, a7
-; FAST90A-NEXT: v_accvgpr_read_b32 v26, a6
-; FAST90A-NEXT: v_accvgpr_read_b32 v27, a5
-; FAST90A-NEXT: v_accvgpr_read_b32 v28, a4
-; FAST90A-NEXT: v_accvgpr_read_b32 v29, a3
-; FAST90A-NEXT: v_accvgpr_read_b32 v30, a2
-; FAST90A-NEXT: v_accvgpr_read_b32 v31, a1
-; FAST90A-NEXT: v_accvgpr_read_b32 v32, a0
-; FAST90A-NEXT: v_accvgpr_mov_b32 a0, a32
-; FAST90A-NEXT: v_accvgpr_mov_b32 a1, a33
-; FAST90A-NEXT: v_accvgpr_write_b32 a2, v32
-; FAST90A-NEXT: v_accvgpr_write_b32 a3, v31
-; FAST90A-NEXT: v_accvgpr_write_b32 a4, v30
-; FAST90A-NEXT: v_accvgpr_write_b32 a5, v29
-; FAST90A-NEXT: v_accvgpr_write_b32 a6, v28
-; FAST90A-NEXT: v_accvgpr_write_b32 a7, v27
-; FAST90A-NEXT: v_accvgpr_write_b32 a8, v26
-; FAST90A-NEXT: v_accvgpr_write_b32 a9, v25
-; FAST90A-NEXT: v_accvgpr_write_b32 a10, v24
-; FAST90A-NEXT: v_accvgpr_write_b32 a11, v23
-; FAST90A-NEXT: v_accvgpr_write_b32 a12, v22
-; FAST90A-NEXT: v_accvgpr_write_b32 a13, v21
-; FAST90A-NEXT: v_accvgpr_write_b32 a14, v20
-; FAST90A-NEXT: v_accvgpr_write_b32 a15, v19
-; FAST90A-NEXT: v_accvgpr_write_b32 a16, v18
-; FAST90A-NEXT: v_accvgpr_write_b32 a17, v17
-; FAST90A-NEXT: v_accvgpr_write_b32 a18, v16
-; FAST90A-NEXT: v_accvgpr_write_b32 a19, v15
-; FAST90A-NEXT: v_accvgpr_write_b32 a20, v14
-; FAST90A-NEXT: v_accvgpr_write_b32 a21, v13
-; FAST90A-NEXT: v_accvgpr_write_b32 a22, v12
-; FAST90A-NEXT: v_accvgpr_write_b32 a23, v11
-; FAST90A-NEXT: v_accvgpr_write_b32 a24, v10
-; FAST90A-NEXT: v_accvgpr_write_b32 a25, v9
-; FAST90A-NEXT: v_accvgpr_write_b32 a26, v8
-; FAST90A-NEXT: v_accvgpr_write_b32 a27, v7
-; FAST90A-NEXT: v_accvgpr_write_b32 a28, v6
-; FAST90A-NEXT: v_accvgpr_write_b32 a29, v5
-; FAST90A-NEXT: v_accvgpr_write_b32 a30, v4
-; FAST90A-NEXT: v_accvgpr_write_b32 a31, v3
+; FAST90A-NEXT: v_accvgpr_mov_b32 a2, a32
+; FAST90A-NEXT: v_accvgpr_mov_b32 a3, a33
+; FAST90A-NEXT: v_accvgpr_mov_b32 a4, a34
+; FAST90A-NEXT: v_accvgpr_mov_b32 a5, a35
+; FAST90A-NEXT: v_accvgpr_mov_b32 a6, a36
+; FAST90A-NEXT: v_accvgpr_mov_b32 a7, a37
+; FAST90A-NEXT: v_accvgpr_mov_b32 a8, a38
+; FAST90A-NEXT: v_accvgpr_mov_b32 a9, a39
+; FAST90A-NEXT: v_accvgpr_mov_b32 a10, a40
+; FAST90A-NEXT: v_accvgpr_mov_b32 a11, a41
+; FAST90A-NEXT: v_accvgpr_mov_b32 a12, a42
+; FAST90A-NEXT: v_accvgpr_mov_b32 a13, a43
+; FAST90A-NEXT: v_accvgpr_mov_b32 a14, a44
+; FAST90A-NEXT: v_accvgpr_mov_b32 a15, a45
+; FAST90A-NEXT: v_accvgpr_mov_b32 a16, a46
+; FAST90A-NEXT: v_accvgpr_mov_b32 a17, a47
+; FAST90A-NEXT: v_accvgpr_mov_b32 a18, a48
+; FAST90A-NEXT: v_accvgpr_mov_b32 a19, a49
+; FAST90A-NEXT: v_accvgpr_mov_b32 a20, a50
+; FAST90A-NEXT: v_accvgpr_mov_b32 a21, a51
+; FAST90A-NEXT: v_accvgpr_mov_b32 a22, a52
+; FAST90A-NEXT: v_accvgpr_mov_b32 a23, a53
+; FAST90A-NEXT: v_accvgpr_mov_b32 a24, a54
+; FAST90A-NEXT: v_accvgpr_mov_b32 a25, a55
+; FAST90A-NEXT: v_accvgpr_mov_b32 a26, a56
+; FAST90A-NEXT: v_accvgpr_mov_b32 a27, a57
+; FAST90A-NEXT: v_accvgpr_mov_b32 a28, a58
+; FAST90A-NEXT: v_accvgpr_mov_b32 a29, a59
+; FAST90A-NEXT: v_accvgpr_mov_b32 a30, a60
+; FAST90A-NEXT: v_accvgpr_mov_b32 a31, a61
; FAST90A-NEXT: s_nop 1
; FAST90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
; FAST90A-NEXT: s_nop 15
@@ -626,82 +594,98 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
; GREEDY908: ; %bb.0: ; %bb
; GREEDY908-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
; GREEDY908-NEXT: v_mov_b32_e32 v0, 1.0
-; GREEDY908-NEXT: v_mov_b32_e32 v4, 0
+; GREEDY908-NEXT: v_mov_b32_e32 v16, 0
; GREEDY908-NEXT: s_waitcnt lgkmcnt(0)
; GREEDY908-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
; GREEDY908-NEXT: s_waitcnt lgkmcnt(0)
-; GREEDY908-NEXT: v_mov_b32_e32 v5, s15
-; GREEDY908-NEXT: v_mov_b32_e32 v2, s14
-; GREEDY908-NEXT: v_mov_b32_e32 v1, s13
-; GREEDY908-NEXT: v_accvgpr_write_b32 a33, v5
-; GREEDY908-NEXT: v_mov_b32_e32 v5, s12
-; GREEDY908-NEXT: v_accvgpr_write_b32 a32, v2
-; GREEDY908-NEXT: v_accvgpr_write_b32 a31, v1
-; GREEDY908-NEXT: v_accvgpr_write_b32 a30, v5
-; GREEDY908-NEXT: v_mov_b32_e32 v2, s11
-; GREEDY908-NEXT: v_mov_b32_e32 v1, s10
-; GREEDY908-NEXT: v_mov_b32_e32 v5, s9
-; GREEDY908-NEXT: v_accvgpr_write_b32 a29, v2
-; GREEDY908-NEXT: v_accvgpr_write_b32 a28, v1
-; GREEDY908-NEXT: v_accvgpr_write_b32 a27, v5
-; GREEDY908-NEXT: v_mov_b32_e32 v2, s8
-; GREEDY908-NEXT: v_mov_b32_e32 v1, s7
-; GREEDY908-NEXT: v_mov_b32_e32 v5, s6
-; GREEDY908-NEXT: v_accvgpr_write_b32 a26, v2
-; GREEDY908-NEXT: v_accvgpr_write_b32 a25, v1
-; GREEDY908-NEXT: v_accvgpr_write_b32 a24, v5
-; GREEDY908-NEXT: v_mov_b32_e32 v2, s5
-; GREEDY908-NEXT: v_mov_b32_e32 v1, s4
-; GREEDY908-NEXT: v_mov_b32_e32 v5, s3
-; GREEDY908-NEXT: v_accvgpr_write_b32 a23, v2
-; GREEDY908-NEXT: v_accvgpr_write_b32 a22, v1
-; GREEDY908-NEXT: v_accvgpr_write_b32 a21, v5
-; GREEDY908-NEXT: v_mov_b32_e32 v2, s2
+; GREEDY908-NEXT: v_mov_b32_e32 v17, s0
; GREEDY908-NEXT: v_mov_b32_e32 v1, s1
-; GREEDY908-NEXT: v_mov_b32_e32 v5, s0
-; GREEDY908-NEXT: v_accvgpr_write_b32 a20, v2
-; GREEDY908-NEXT: v_accvgpr_write_b32 a19, v1
-; GREEDY908-NEXT: v_accvgpr_write_b32 a18, v5
+; GREEDY908-NEXT: v_mov_b32_e32 v2, s2
+; GREEDY908-NEXT: v_accvgpr_write_b32 a0, v17
+; GREEDY908-NEXT: v_mov_b32_e32 v17, s3
+; GREEDY908-NEXT: v_accvgpr_write_b32 a1, v1
+; GREEDY908-NEXT: v_accvgpr_write_b32 a2, v2
+; GREEDY908-NEXT: v_accvgpr_write_b32 a3, v17
+; GREEDY908-NEXT: v_mov_b32_e32 v1, s4
+; GREEDY908-NEXT: v_mov_b32_e32 v2, s5
+; GREEDY908-NEXT: v_mov_b32_e32 v17, s6
+; GREEDY908-NEXT: v_accvgpr_write_b32 a4, v1
+; GREEDY908-NEXT: v_accvgpr_write_b32 a5, v2
+; GREEDY908-NEXT: v_accvgpr_write_b32 a6, v17
+; GREEDY908-NEXT: v_mov_b32_e32 v1, s7
+; GREEDY908-NEXT: v_mov_b32_e32 v2, s8
+; GREEDY908-NEXT: v_mov_b32_e32 v17, s9
+; GREEDY908-NEXT: v_accvgpr_write_b32 a7, v1
+; GREEDY908-NEXT: v_accvgpr_write_b32 a8, v2
+; GREEDY908-NEXT: v_accvgpr_write_b32 a9, v17
+; GREEDY908-NEXT: v_mov_b32_e32 v1, s10
+; GREEDY908-NEXT: v_mov_b32_e32 v2, s11
+; GREEDY908-NEXT: v_mov_b32_e32 v17, s12
+; GREEDY908-NEXT: v_accvgpr_write_b32 a10, v1
+; GREEDY908-NEXT: v_accvgpr_write_b32 a11, v2
+; GREEDY908-NEXT: v_accvgpr_write_b32 a12, v17
+; GREEDY908-NEXT: v_mov_b32_e32 v1, s13
+; GREEDY908-NEXT: v_mov_b32_e32 v2, s14
+; GREEDY908-NEXT: v_mov_b32_e32 v17, s15
+; GREEDY908-NEXT: v_accvgpr_write_b32 a13, v1
+; GREEDY908-NEXT: v_accvgpr_write_b32 a14, v2
+; GREEDY908-NEXT: v_accvgpr_write_b32 a15, v17
; GREEDY908-NEXT: v_mov_b32_e32 v1, 2.0
; GREEDY908-NEXT: s_nop 1
-; GREEDY908-NEXT: v_mfma_f32_16x16x1f32 a[18:33], v0, v1, a[18:33]
-; GREEDY908-NEXT: v_mfma_f32_16x16x1f32 a[2:17], v0, v1, a[18:33]
-; GREEDY908-NEXT: s_nop 8
-; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a19
-; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a18
-; GREEDY908-NEXT: s_nop 0
-; GREEDY908-NEXT: v_accvgpr_write_b32 a1, v2
-; GREEDY908-NEXT: v_accvgpr_write_b32 a0, v3
+; GREEDY908-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
+; GREEDY908-NEXT: v_mfma_f32_16x16x1f32 a[16:31], v0, v1, a[0:15]
+; GREEDY908-NEXT: s_nop 9
+; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a16
+; GREEDY908-NEXT: v_accvgpr_read_b32 v17, a17
+; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a18
+; GREEDY908-NEXT: v_accvgpr_write_b32 a2, v3
+; GREEDY908-NEXT: v_accvgpr_write_b32 a3, v17
+; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a19
+; GREEDY908-NEXT: v_accvgpr_read_b32 v17, a20
+; GREEDY908-NEXT: v_accvgpr_write_b32 a4, v2
+; GREEDY908-NEXT: v_accvgpr_write_b32 a5, v3
+; GREEDY908-NEXT: v_accvgpr_write_b32 a6, v17
+; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a21
+; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a22
+; GREEDY908-NEXT: v_accvgpr_read_b32 v17, a23
+; GREEDY908-NEXT: v_accvgpr_write_b32 a7, v2
+; GREEDY908-NEXT: v_accvgpr_write_b32 a8, v3
+; GREEDY908-NEXT: v_accvgpr_write_b32 a9, v17
+; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a24
+; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a25
+; GREEDY908-NEXT: v_accvgpr_read_b32 v17, a26
+; GREEDY908-NEXT: v_accvgpr_write_b32 a10, v2
+; GREEDY908-NEXT: v_accvgpr_write_b32 a11, v3
+; GREEDY908-NEXT: v_accvgpr_write_b32 a12, v17
+; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a27
+; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a28
+; GREEDY908-NEXT: v_accvgpr_read_b32 v17, a29
+; GREEDY908-NEXT: v_accvgpr_write_b32 a13, v2
+; GREEDY908-NEXT: v_accvgpr_write_b32 a14, v3
+; GREEDY908-NEXT: v_accvgpr_write_b32 a15, v17
; GREEDY908-NEXT: s_nop 0
; GREEDY908-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
; GREEDY908-NEXT: s_nop 9
-; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a15
-; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a14
-; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a13
; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a12
-; GREEDY908-NEXT: s_nop 1
-; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:48
-; GREEDY908-NEXT: s_nop 0
-; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a11
-; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a10
-; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a9
-; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a8
-; GREEDY908-NEXT: s_nop 1
-; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:32
-; GREEDY908-NEXT: s_nop 0
-; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a7
-; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a6
-; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a5
-; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a4
-; GREEDY908-NEXT: s_nop 1
-; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16
-; GREEDY908-NEXT: s_nop 0
-; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a3
-; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a2
-; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a1
-; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a0
-; GREEDY908-NEXT: s_nop 1
-; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
+; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a13
+; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a14
+; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a15
+; GREEDY908-NEXT: v_accvgpr_read_b32 v12, a0
+; GREEDY908-NEXT: v_accvgpr_read_b32 v13, a1
+; GREEDY908-NEXT: v_accvgpr_read_b32 v14, a2
+; GREEDY908-NEXT: v_accvgpr_read_b32 v15, a3
+; GREEDY908-NEXT: v_accvgpr_read_b32 v4, a8
+; GREEDY908-NEXT: v_accvgpr_read_b32 v5, a9
+; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a10
+; GREEDY908-NEXT: v_accvgpr_read_b32 v7, a11
+; GREEDY908-NEXT: v_accvgpr_read_b32 v8, a4
+; GREEDY908-NEXT: v_accvgpr_read_b32 v9, a5
+; GREEDY908-NEXT: v_accvgpr_read_b32 v10, a6
+; GREEDY908-NEXT: v_accvgpr_read_b32 v11, a7
+; GREEDY908-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:48
+; GREEDY908-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:32
+; GREEDY908-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16
+; GREEDY908-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17]
; GREEDY908-NEXT: s_endpgm
;
; GREEDY90A-LABEL: test_mfma_f32_16x16x1f32:
@@ -709,39 +693,51 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
; GREEDY90A-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
; GREEDY90A-NEXT: v_mov_b32_e32 v0, 1.0
; GREEDY90A-NEXT: v_mov_b32_e32 v1, 2.0
-; GREEDY90A-NEXT: v_mov_b32_e32 v2, 0
; GREEDY90A-NEXT: s_waitcnt lgkmcnt(0)
; GREEDY90A-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
; GREEDY90A-NEXT: s_waitcnt lgkmcnt(0)
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a33, s15
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a32, s14
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a31, s13
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a30, s12
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a29, s11
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a28, s10
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a27, s9
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a26, s8
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a25, s7
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a24, s6
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a23, s5
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a22, s4
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a21, s3
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a20, s2
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a19, s1
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a18, s0
-; GREEDY90A-NEXT: s_nop 1
-; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[18:33], v0, v1, a[18:33]
-; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[2:17], v0, v1, a[18:33]
-; GREEDY90A-NEXT: s_nop 9
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a0, a18
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a1, a19
+; GREEDY90A-NEXT: v_accvgpr_write_b32 a0, s0
+; GREEDY90A-NEXT: v_accvgpr_write_b32 a1, s1
+; GREEDY90A-NEXT: v_accvgpr_write_b32 a2, s2
+; GREEDY90A-NEXT: v_accvgpr_write_b32 a3, s3
+; GREEDY90A-NEXT: v_accvgpr_write_b32 a4, s4
+; GREEDY90A-NEXT: v_accvgpr_write_b32 a5, s5
+; GREEDY90A-NEXT: v_accvgpr_write_b32 a6, s6
+; GREEDY90A-NEXT: v_accvgpr_write_b32 a7, s7
+; GREEDY90A-NEXT: v_accvgpr_write_b32 a8, s8
+; GREEDY90A-NEXT: v_accvgpr_write_b32 a9, s9
+; GREEDY90A-NEXT: v_accvgpr_write_b32 a10, s10
+; GREEDY90A-NEXT: v_accvgpr_write_b32 a11, s11
+; GREEDY90A-NEXT: v_accvgpr_write_b32 a12, s12
+; GREEDY90A-NEXT: v_accvgpr_write_b32 a13, s13
+; GREEDY90A-NEXT: v_accvgpr_write_b32 a14, s14
+; GREEDY90A-NEXT: v_accvgpr_write_b32 a15, s15
; GREEDY90A-NEXT: s_nop 1
; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
+; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[16:31], v0, v1, a[0:15]
; GREEDY90A-NEXT: s_nop 10
-; GREEDY90A-NEXT: global_store_dwordx4 v2, a[12:15], s[16:17] offset:48
-; GREEDY90A-NEXT: global_store_dwordx4 v2, a[8:11], s[16:17] offset:32
-; GREEDY90A-NEXT: global_store_dwordx4 v2, a[4:7], s[16:17] offset:16
-; GREEDY90A-NEXT: global_store_dwordx4 v2, a[0:3], s[16:17]
+; GREEDY90A-NEXT: v_accvgpr_mov_b32 a2, a16
+; GREEDY90A-NEXT: v_accvgpr_mov_b32 a3, a17
+; GREEDY90A-NEXT: v_accvgpr_mov_b32 a4, a18
+; GREEDY90A-NEXT: v_accvgpr_mov_b32 a5, a19
+; GREEDY90A-NEXT: v_accvgpr_mov_b32 a6, a20
+; GREEDY90A-NEXT: v_accvgpr_mov_b32 a7, a21
+; GREEDY90A-NEXT: v_accvgpr_mov_b32 a8, a22
+; GREEDY90A-NEXT: v_accvgpr_mov_b32 a9, a23
+; GREEDY90A-NEXT: v_accvgpr_mov_b32 a10, a24
+; GREEDY90A-NEXT: v_accvgpr_mov_b32 a11, a25
+; GREEDY90A-NEXT: v_accvgpr_mov_b32 a12, a26
+; GREEDY90A-NEXT: v_accvgpr_mov_b32 a13, a27
+; GREEDY90A-NEXT: v_accvgpr_mov_b32 a14, a28
+; GREEDY90A-NEXT: v_accvgpr_mov_b32 a15, a29
+; GREEDY90A-NEXT: s_nop 1
+; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
+; GREEDY90A-NEXT: v_mov_b32_e32 v0, 0
+; GREEDY90A-NEXT: s_nop 9
+; GREEDY90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GREEDY90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
+; GREEDY90A-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
+; GREEDY90A-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
; GREEDY90A-NEXT: s_endpgm
;
; GREEDY942-LABEL: test_mfma_f32_16x16x1f32:
@@ -749,39 +745,51 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
; GREEDY942-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
; GREEDY942-NEXT: v_mov_b32_e32 v0, 1.0
; GREEDY942-NEXT: v_mov_b32_e32 v1, 2.0
-; GREEDY942-NEXT: v_mov_b32_e32 v2, 0
; GREEDY942-NEXT: s_waitcnt lgkmcnt(0)
; GREEDY942-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
; GREEDY942-NEXT: s_waitcnt lgkmcnt(0)
-; GREEDY942-NEXT: v_accvgpr_write_b32 a33, s15
-; GREEDY942-NEXT: v_accvgpr_write_b32 a32, s14
-; GREEDY942-NEXT: v_accvgpr_write_b32 a31, s13
-; GREEDY942-NEXT: v_accvgpr_write_b32 a30, s12
-; GREEDY942-NEXT: v_accvgpr_write_b32 a29, s11
-; GREEDY942-NEXT: v_accvgpr_write_b32 a28, s10
-; GREEDY942-NEXT: v_accvgpr_write_b32 a27, s9
-; GREEDY942-NEXT: v_accvgpr_write_b32 a26, s8
-; GREEDY942-NEXT: v_accvgpr_write_b32 a25, s7
-; GREEDY942-NEXT: v_accvgpr_write_b32 a24, s6
-; GREEDY942-NEXT: v_accvgpr_write_b32 a23, s5
-; GREEDY942-NEXT: v_accvgpr_write_b32 a22, s4
-; GREEDY942-NEXT: v_accvgpr_write_b32 a21, s3
-; GREEDY942-NEXT: v_accvgpr_write_b32 a20, s2
-; GREEDY942-NEXT: v_accvgpr_write_b32 a19, s1
-; GREEDY942-NEXT: v_accvgpr_write_b32 a18, s0
-; GREEDY942-NEXT: s_nop 1
-; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[18:33], v0, v1, a[18:33]
-; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[2:17], v0, v1, a[18:33]
-; GREEDY942-NEXT: s_nop 8
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a0, a18
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a1, a19
+; GREEDY942-NEXT: v_accvgpr_write_b32 a0, s0
+; GREEDY942-NEXT: v_accvgpr_write_b32 a1, s1
+; GREEDY942-NEXT: v_accvgpr_write_b32 a2, s2
+; GREEDY942-NEXT: v_accvgpr_write_b32 a3, s3
+; GREEDY942-NEXT: v_accvgpr_write_b32 a4, s4
+; GREEDY942-NEXT: v_accvgpr_write_b32 a5, s5
+; GREEDY942-NEXT: v_accvgpr_write_b32 a6, s6
+; GREEDY942-NEXT: v_accvgpr_write_b32 a7, s7
+; GREEDY942-NEXT: v_accvgpr_write_b32 a8, s8
+; GREEDY942-NEXT: v_accvgpr_write_b32 a9, s9
+; GREEDY942-NEXT: v_accvgpr_write_b32 a10, s10
+; GREEDY942-NEXT: v_accvgpr_write_b32 a11, s11
+; GREEDY942-NEXT: v_accvgpr_write_b32 a12, s12
+; GREEDY942-NEXT: v_accvgpr_write_b32 a13, s13
+; GREEDY942-NEXT: v_accvgpr_write_b32 a14, s14
+; GREEDY942-NEXT: v_accvgpr_write_b32 a15, s15
; GREEDY942-NEXT: s_nop 1
; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v0, v1, a[0:15]
+; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[16:31], v0, v1, a[0:15]
; GREEDY942-NEXT: s_nop 9
-; GREEDY942-NEXT: global_store_dwordx4 v2, a[12:15], s[16:17] offset:48
-; GREEDY942-NEXT: global_store_dwordx4 v2, a[8:11], s[16:17] offset:32
-; GREEDY942-NEXT: global_store_dwordx4 v2, a[4:7], s[16:17] offset:16
-; GREEDY942-NEXT: global_store_dwordx4 v2, a[0:3], s[16:17]
+; GREEDY942-NEXT: v_accvgpr_mov_b32 a2, a16
+; GREEDY942-NEXT: v_accvgpr_mov_b32 a3, a17
+; GREEDY942-NEXT: v_accvgpr_mov_b32 a4, a18
+; GREEDY942-NEXT: v_accvgpr_mov_b32 a5, a19
+; GREEDY942-NEXT: v_accvgpr_mov_b32 a6, a20
+; GREEDY942-NEXT: v_accvgpr_mov_b32 a7, a21
+; GREEDY942-NEXT: v_accvgpr_mov_b32 a8, a22
+; GREEDY942-NEXT: v_accvgpr_mov_b32 a9, a23
+; GREEDY942-NEXT: v_accvgpr_mov_b32 a10, a24
+; GREEDY942-NEXT: v_accvgpr_mov_b32 a11, a25
+; GREEDY942-NEXT: v_accvgpr_mov_b32 a12, a26
+; GREEDY942-NEXT: v_accvgpr_mov_b32 a13, a27
+; GREEDY942-NEXT: v_accvgpr_mov_b32 a14, a28
+; GREEDY942-NEXT: v_accvgpr_mov_b32 a15, a29
+; GREEDY942-NEXT: s_nop 1
+; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v0, v1, a[0:15]
+; GREEDY942-NEXT: v_mov_b32_e32 v0, 0
+; GREEDY942-NEXT: s_nop 8
+; GREEDY942-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GREEDY942-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
+; GREEDY942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
+; GREEDY942-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
; GREEDY942-NEXT: s_endpgm
;
; GREEDY90A-GISEL-LABEL: test_mfma_f32_16x16x1f32:
@@ -839,9 +847,8 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
; FAST90A-LABEL: test_mfma_f32_16x16x1f32:
; FAST90A: ; %bb.0: ; %bb
; FAST90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; FAST90A-NEXT: v_mov_b32_e32 v1, 1.0
-; FAST90A-NEXT: v_mov_b32_e32 v2, 2.0
-; FAST90A-NEXT: v_mov_b32_e32 v0, 0
+; FAST90A-NEXT: v_mov_b32_e32 v0, 1.0
+; FAST90A-NEXT: v_mov_b32_e32 v1, 2.0
; FAST90A-NEXT: s_waitcnt lgkmcnt(0)
; FAST90A-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x0
; FAST90A-NEXT: s_waitcnt lgkmcnt(0)
@@ -862,8 +869,8 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
; FAST90A-NEXT: v_accvgpr_write_b32 a14, s18
; FAST90A-NEXT: v_accvgpr_write_b32 a15, s19
; FAST90A-NEXT: s_nop 1
-; FAST90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v1, v2, a[0:15]
-; FAST90A-NEXT: v_mfma_f32_16x16x1f32 a[16:31], v1, v2, a[0:15]
+; FAST90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
+; FAST90A-NEXT: v_mfma_f32_16x16x1f32 a[16:31], v0, v1, a[0:15]
; FAST90A-NEXT: s_nop 10
; FAST90A-NEXT: v_accvgpr_mov_b32 a2, a16
; FAST90A-NEXT: v_accvgpr_mov_b32 a3, a17
@@ -880,8 +887,9 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
; FAST90A-NEXT: v_accvgpr_mov_b32 a14, a28
; FAST90A-NEXT: v_accvgpr_mov_b32 a15, a29
; FAST90A-NEXT: s_nop 1
-; FAST90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v1, v2, a[0:15]
-; FAST90A-NEXT: s_nop 10
+; FAST90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
+; FAST90A-NEXT: v_mov_b32_e32 v0, 0
+; FAST90A-NEXT: s_nop 9
; FAST90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
; FAST90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
; FAST90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
diff --git a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll
index cf244f0b1f884..c77042d0c96c3 100644
--- a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll
+++ b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll
@@ -6,10 +6,10 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) {
; GFX942-LABEL: matmul_kernel:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX942-NEXT: v_accvgpr_write_b32 a2, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a0, 0
; GFX942-NEXT: s_mov_b32 s2, 0
; GFX942-NEXT: v_accvgpr_write_b32 a1, 0
-; GFX942-NEXT: s_mov_b32 s3, 0
+; GFX942-NEXT: s_mov_b32 s6, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_cmp_lg_u32 s0, 0
; GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0
@@ -18,34 +18,33 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) {
; GFX942-NEXT: s_branch .LBB0_2
; GFX942-NEXT: .LBB0_1: ; %bb2
; GFX942-NEXT: ; in Loop: Header=BB0_2 Depth=1
-; GFX942-NEXT: s_or_b32 s4, s3, 1
-; GFX942-NEXT: s_ashr_i32 s5, s3, 31
; GFX942-NEXT: s_mov_b32 s3, s2
-; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX942-NEXT: v_accvgpr_mov_b32 a0, a2
+; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX942-NEXT: v_accvgpr_mov_b32 a2, a1
; GFX942-NEXT: v_accvgpr_mov_b32 a3, a1
-; GFX942-NEXT: s_and_b32 s3, s5, s4
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mfma_f32_16x16x16_f16 a[2:5], v[2:3], v[2:3], a[0:3]
+; GFX942-NEXT: s_or_b32 s4, s6, 1
+; GFX942-NEXT: s_ashr_i32 s3, s6, 31
+; GFX942-NEXT: v_mfma_f32_16x16x16_f16 a[2:5], v[0:1], v[0:1], a[0:3]
+; GFX942-NEXT: s_and_b32 s6, s3, s4
+; GFX942-NEXT: s_nop 5
+; GFX942-NEXT: v_accvgpr_mov_b32 a0, a2
; GFX942-NEXT: s_cbranch_execz .LBB0_4
; GFX942-NEXT: .LBB0_2: ; %bb
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_and_b64 vcc, exec, s[0:1]
; GFX942-NEXT: s_cbranch_vccz .LBB0_1
; GFX942-NEXT: ; %bb.3:
-; GFX942-NEXT: ; implicit-def: $sgpr3
-; GFX942-NEXT: ; implicit-def: $agpr2
+; GFX942-NEXT: ; implicit-def: $sgpr6
; GFX942-NEXT: .LBB0_4: ; %common.ret
; GFX942-NEXT: s_endpgm
;
; GFX908-LABEL: matmul_kernel:
; GFX908: ; %bb.0: ; %entry
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX908-NEXT: v_accvgpr_write_b32 a2, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a0, 0
; GFX908-NEXT: v_accvgpr_write_b32 a1, 0
; GFX908-NEXT: s_mov_b32 s2, 0
-; GFX908-NEXT: s_mov_b32 s3, 0
+; GFX908-NEXT: s_mov_b32 s6, 0
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: s_cmp_lg_u32 s0, 0
; GFX908-NEXT: s_cselect_b64 s[0:1], -1, 0
@@ -54,28 +53,28 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) {
; GFX908-NEXT: s_branch .LBB0_2
; GFX908-NEXT: .LBB0_1: ; %bb2
; GFX908-NEXT: ; in Loop: Header=BB0_2 Depth=1
-; GFX908-NEXT: s_or_b32 s4, s3, 1
-; GFX908-NEXT: s_ashr_i32 s5, s3, 31
; GFX908-NEXT: s_mov_b32 s3, s2
-; GFX908-NEXT: v_mov_b32_e32 v1, s2
-; GFX908-NEXT: s_nop 2
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a2
-; GFX908-NEXT: v_mov_b32_e32 v2, s3
+; GFX908-NEXT: v_mov_b32_e32 v0, s2
+; GFX908-NEXT: v_mov_b32_e32 v1, s3
; GFX908-NEXT: v_accvgpr_read_b32 v4, a1
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a1
-; GFX908-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a1
+; GFX908-NEXT: s_or_b32 s4, s6, 1
; GFX908-NEXT: v_accvgpr_write_b32 a2, v4
-; GFX908-NEXT: v_accvgpr_write_b32 a3, v3
-; GFX908-NEXT: s_and_b32 s3, s5, s4
-; GFX908-NEXT: v_mfma_f32_16x16x16f16 a[2:5], v[1:2], v[1:2], a[0:3]
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v2
+; GFX908-NEXT: s_ashr_i32 s3, s6, 31
+; GFX908-NEXT: v_mfma_f32_16x16x16f16 a[2:5], v[0:1], v[0:1], a[0:3]
+; GFX908-NEXT: s_and_b32 s6, s3, s4
+; GFX908-NEXT: s_nop 8
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_write_b32 a0, v2
; GFX908-NEXT: s_cbranch_execz .LBB0_4
; GFX908-NEXT: .LBB0_2: ; %bb
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_and_b64 vcc, exec, s[0:1]
; GFX908-NEXT: s_cbranch_vccz .LBB0_1
; GFX908-NEXT: ; %bb.3:
-; GFX908-NEXT: ; implicit-def: $sgpr3
-; GFX908-NEXT: ; implicit-def: $agpr2
+; GFX908-NEXT: ; implicit-def: $sgpr6
; GFX908-NEXT: .LBB0_4: ; %common.ret
; GFX908-NEXT: s_endpgm
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.mir b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.mir
index 01506d0af1913..29f44282f06fc 100644
--- a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.mir
+++ b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.mir
@@ -83,13 +83,12 @@ body: |
; COALESCE-NEXT: undef [[AV_MOV_:%[0-9]+]].sub0:areg_128_align2 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
; COALESCE-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_CSELECT_B64_]], implicit $exec
; COALESCE-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 1, [[V_CNDMASK_B32_e64_]], implicit $exec
- ; COALESCE-NEXT: undef [[AV_MOV_1:%[0-9]+]].sub1:areg_128_align2 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
+ ; COALESCE-NEXT: [[AV_MOV_:%[0-9]+]].sub1:areg_128_align2 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
; COALESCE-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0
; COALESCE-NEXT: {{ $}}
; COALESCE-NEXT: bb.1:
; COALESCE-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
; COALESCE-NEXT: {{ $}}
- ; COALESCE-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[AV_MOV_]].sub0
; COALESCE-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 -1
; COALESCE-NEXT: $vcc = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_]], implicit-def dead $scc
; COALESCE-NEXT: S_CBRANCH_VCCNZ %bb.3, implicit killed $vcc
@@ -102,12 +101,12 @@ body: |
; COALESCE-NEXT: [[S_ASHR_I32_:%[0-9]+]]:sreg_32 = S_ASHR_I32 [[S_MOV_B32_1]], 31, implicit-def dead $scc
; COALESCE-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_ASHR_I32_]], [[S_OR_B32_]], implicit-def dead $scc
; COALESCE-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_64 = COPY [[S_MOV_B32_]].sub0
- ; COALESCE-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B32_]]
- ; COALESCE-NEXT: [[AV_MOV_1:%[0-9]+]].sub0:areg_128_align2 = COPY [[COPY1]]
- ; COALESCE-NEXT: [[AV_MOV_1:%[0-9]+]].sub2:areg_128_align2 = COPY [[AV_MOV_1]].sub1
- ; COALESCE-NEXT: [[AV_MOV_1:%[0-9]+]].sub3:areg_128_align2 = COPY [[AV_MOV_1]].sub1
- ; COALESCE-NEXT: [[AV_MOV_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY2]], [[COPY2]], [[AV_MOV_1]], 0, 0, 0, implicit $mode, implicit $exec
+ ; COALESCE-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY [[S_MOV_B32_]]
+ ; COALESCE-NEXT: [[AV_MOV_:%[0-9]+]].sub2:areg_128_align2 = COPY [[AV_MOV_]].sub1
+ ; COALESCE-NEXT: [[AV_MOV_:%[0-9]+]].sub3:areg_128_align2 = COPY [[AV_MOV_]].sub1
+ ; COALESCE-NEXT: [[V_MFMA_F32_16X16X16F16_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[AV_MOV_]], 0, 0, 0, implicit $mode, implicit $exec
; COALESCE-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 0
+ ; COALESCE-NEXT: [[AV_MOV_:%[0-9]+]].sub0:areg_128_align2 = COPY [[V_MFMA_F32_16X16X16F16_e64_]].sub0
; COALESCE-NEXT: {{ $}}
; COALESCE-NEXT: bb.3:
; COALESCE-NEXT: successors: %bb.4(0x40000000), %bb.1(0x40000000)
@@ -137,13 +136,12 @@ body: |
; GFX908-COALESCE-NEXT: undef [[AV_MOV_:%[0-9]+]].sub0:areg_128_align2 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
; GFX908-COALESCE-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_CSELECT_B64_]], implicit $exec
; GFX908-COALESCE-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 1, [[V_CNDMASK_B32_e64_]], implicit $exec
- ; GFX908-COALESCE-NEXT: undef [[AV_MOV_1:%[0-9]+]].sub1:areg_128_align2 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
+ ; GFX908-COALESCE-NEXT: [[AV_MOV_:%[0-9]+]].sub1:areg_128_align2 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
; GFX908-COALESCE-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0
; GFX908-COALESCE-NEXT: {{ $}}
; GFX908-COALESCE-NEXT: bb.1:
; GFX908-COALESCE-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
; GFX908-COALESCE-NEXT: {{ $}}
- ; GFX908-COALESCE-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[AV_MOV_]].sub0
; GFX908-COALESCE-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 -1
; GFX908-COALESCE-NEXT: $vcc = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_]], implicit-def dead $scc
; GFX908-COALESCE-NEXT: S_CBRANCH_VCCNZ %bb.3, implicit killed $vcc
@@ -156,12 +154,12 @@ body: |
; GFX908-COALESCE-NEXT: [[S_ASHR_I32_:%[0-9]+]]:sreg_32 = S_ASHR_I32 [[S_MOV_B32_1]], 31, implicit-def dead $scc
; GFX908-COALESCE-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_ASHR_I32_]], [[S_OR_B32_]], implicit-def dead $scc
; GFX908-COALESCE-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_64 = COPY [[S_MOV_B32_]].sub0
- ; GFX908-COALESCE-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B32_]]
- ; GFX908-COALESCE-NEXT: [[AV_MOV_1:%[0-9]+]].sub0:areg_128_align2 = COPY [[COPY1]]
- ; GFX908-COALESCE-NEXT: [[AV_MOV_1:%[0-9]+]].sub2:areg_128_align2 = COPY [[AV_MOV_1]].sub1
- ; GFX908-COALESCE-NEXT: [[AV_MOV_1:%[0-9]+]].sub3:areg_128_align2 = COPY [[AV_MOV_1]].sub1
- ; GFX908-COALESCE-NEXT: [[AV_MOV_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY2]], [[COPY2]], [[AV_MOV_1]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX908-COALESCE-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY [[S_MOV_B32_]]
+ ; GFX908-COALESCE-NEXT: [[AV_MOV_:%[0-9]+]].sub2:areg_128_align2 = COPY [[AV_MOV_]].sub1
+ ; GFX908-COALESCE-NEXT: [[AV_MOV_:%[0-9]+]].sub3:areg_128_align2 = COPY [[AV_MOV_]].sub1
+ ; GFX908-COALESCE-NEXT: [[V_MFMA_F32_16X16X16F16_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[AV_MOV_]], 0, 0, 0, implicit $mode, implicit $exec
; GFX908-COALESCE-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 0
+ ; GFX908-COALESCE-NEXT: [[AV_MOV_:%[0-9]+]].sub0:areg_128_align2 = COPY [[V_MFMA_F32_16X16X16F16_e64_]].sub0
; GFX908-COALESCE-NEXT: {{ $}}
; GFX908-COALESCE-NEXT: bb.3:
; GFX908-COALESCE-NEXT: successors: %bb.4(0x40000000), %bb.1(0x40000000)
diff --git a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-read.mir b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-read.mir
index a9207de317ea1..17458fa8b08a7 100644
--- a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-read.mir
+++ b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-read.mir
@@ -74,7 +74,7 @@ body: |
; COALESCE-NEXT: successors: %bb.3(0x80000000)
; COALESCE-NEXT: {{ $}}
; COALESCE-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]].sub0
- ; COALESCE-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B32_]].sub0_sub1
+ ; COALESCE-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY [[S_MOV_B32_]].sub0_sub1
; COALESCE-NEXT: [[V_MFMA_F32_16X16X16F16_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], 0, 0, 0, 0, implicit $mode, implicit $exec
; COALESCE-NEXT: [[V_MFMA_F32_16X16X16F16_e64_1:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_MFMA_F32_16X16X16F16_e64_]], 0, 0, 0, implicit $mode, implicit $exec
; COALESCE-NEXT: [[V_MFMA_F32_16X16X16F16_e64_2:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_MFMA_F32_16X16X16F16_e64_1]], 0, 0, 0, implicit $mode, implicit $exec
@@ -116,7 +116,7 @@ body: |
; GFX908-COALESCE-NEXT: [[AV_MOV_1:%[0-9]+]].sub2:areg_128_align2 = COPY [[AV_MOV_1]].sub0
; GFX908-COALESCE-NEXT: [[AV_MOV_1:%[0-9]+]].sub3:areg_128_align2 = COPY [[AV_MOV_1]].sub0
; GFX908-COALESCE-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]].sub0
- ; GFX908-COALESCE-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B32_]].sub0_sub1
+ ; GFX908-COALESCE-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY [[S_MOV_B32_]].sub0_sub1
; GFX908-COALESCE-NEXT: [[V_MFMA_F32_16X16X16F16_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[AV_MOV_1]], 0, 0, 0, implicit $mode, implicit $exec
; GFX908-COALESCE-NEXT: [[V_MFMA_F32_16X16X16F16_e64_1:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_MFMA_F32_16X16X16F16_e64_]], 0, 0, 0, implicit $mode, implicit $exec
; GFX908-COALESCE-NEXT: [[V_MFMA_F32_16X16X16F16_e64_2:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_MFMA_F32_16X16X16F16_e64_1]], 0, 0, 0, implicit $mode, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll
index f4a9e7e8f2759..110604a7cd88e 100644
--- a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll
@@ -521,8 +521,8 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
; GFX908-NEXT: v_readlane_b32 s16, v39, 22
; GFX908-NEXT: s_mov_b32 s12, s24
; GFX908-NEXT: s_mov_b32 s13, s23
-; GFX908-NEXT: s_mov_b32 s14, s22
; GFX908-NEXT: v_mov_b32_e32 v31, v32
+; GFX908-NEXT: s_mov_b32 s14, s22
; GFX908-NEXT: s_mov_b32 s15, s21
; GFX908-NEXT: s_mov_b64 s[10:11], s[26:27]
; GFX908-NEXT: v_readlane_b32 s17, v39, 23
diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-phi.ll b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-phi.ll
index fc154604b8700..4e6b9166b3ed0 100644
--- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-phi.ll
+++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-phi.ll
@@ -27,7 +27,6 @@ define amdgpu_kernel void @test_rewrite_mfma_copy_to_agpr_phi(ptr addrspace(1) %
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v33, v34, a[0:31]
-; CHECK-NEXT: ; kill: def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 killed $exec
; CHECK-NEXT: s_cbranch_execz .LBB0_3
; CHECK-NEXT: s_branch .LBB0_4
; CHECK-NEXT: .LBB0_2:
@@ -47,7 +46,6 @@ define amdgpu_kernel void @test_rewrite_mfma_copy_to_agpr_phi(ptr addrspace(1) %
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v32, v33, a[0:31]
-; CHECK-NEXT: ; kill: def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 killed $exec
; CHECK-NEXT: .LBB0_4: ; %endif
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use a[0:31]
diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
index b9e9893ede4e2..ecada6b300aa1 100644
--- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
@@ -369,7 +369,7 @@ define amdgpu_kernel void @illegal_mfma_after_rewrite() #1 {
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_mov_b32 s0, 0
; CHECK-NEXT: s_mov_b32 s1, s0
-; CHECK-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; CHECK-NEXT: v_mov_b64_e32 v[28:29], s[0:1]
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def s[0:3]
; CHECK-NEXT: ;;#ASMEND
@@ -378,73 +378,66 @@ define amdgpu_kernel void @illegal_mfma_after_rewrite() #1 {
; CHECK-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; CHECK-NEXT: s_mov_b32 s0, 0x3c003c00
; CHECK-NEXT: s_mov_b32 s1, s0
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[8:9], v[8:9], v[4:7]
-; CHECK-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
+; CHECK-NEXT: v_mov_b64_e32 v[30:31], s[0:1]
; CHECK-NEXT: s_mov_b32 s0, 0x7e007e00
; CHECK-NEXT: s_mov_b32 s1, s0
-; CHECK-NEXT: v_mov_b64_e32 v[10:11], s[0:1]
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[14:17], v[8:9], v[12:13], v[4:7]
-; CHECK-NEXT: s_nop 1
-; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[18:21], v[8:9], v[10:11], v[4:7]
-; CHECK-NEXT: v_accvgpr_write_b32 a1, v1
-; CHECK-NEXT: v_accvgpr_write_b32 a2, v2
-; CHECK-NEXT: v_accvgpr_write_b32 a3, v3
+; CHECK-NEXT: v_accvgpr_write_b32 a0, s0
+; CHECK-NEXT: v_accvgpr_write_b32 a1, s1
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[28:29], v[28:29], v[4:7]
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[28:29], v[30:31], v[4:7]
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[12:15], v[28:29], a[0:1], v[4:7]
+; CHECK-NEXT: s_nop 2
; CHECK-NEXT: v_mov_b32_e32 v4, 0x7fc00000
; CHECK-NEXT: v_mov_b32_e32 v5, v4
; CHECK-NEXT: v_mov_b32_e32 v6, v4
; CHECK-NEXT: v_mov_b32_e32 v7, v4
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[14:17], v[8:9], v[8:9], v[14:17]
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[28:29], v[28:29], v[8:11]
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[22:25], v[8:9], v[8:9], v[4:7]
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[16:19], v[28:29], v[28:29], v[4:7]
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def v[4:7]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[8:9], v[12:13], v[4:7]
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[26:29], v[8:9], v[8:9], v[4:7]
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[8:9], v[8:9], v[0:3]
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[22:25], v[8:9], v[8:9], v[22:25]
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[4:7], v[8:9], v[8:9], v[26:29]
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[16:19], v[28:29], v[28:29], v[16:19]
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[24:27], v[28:29], v[30:31], v[4:7]
; CHECK-NEXT: s_nop 5
-; CHECK-NEXT: v_cvt_f16_f32_e32 v23, v14
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[14:17], v[8:9], v[8:9], v[18:21]
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[12:13], v[8:9], v[0:3]
-; CHECK-NEXT: s_nop 1
-; CHECK-NEXT: v_accvgpr_read_b32 v19, a3
-; CHECK-NEXT: v_accvgpr_read_b32 v18, a2
-; CHECK-NEXT: v_mov_b64_e32 v[20:21], 0
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: v_accvgpr_read_b32 v17, a1
-; CHECK-NEXT: v_accvgpr_read_b32 v16, a0
-; CHECK-NEXT: v_cvt_f16_f32_e32 v15, v22
-; CHECK-NEXT: v_cvt_f16_f32_e32 v14, v14
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[16:19], v[8:9], v[8:9], v[16:19]
-; CHECK-NEXT: v_cvt_f16_f32_e32 v12, v0
-; CHECK-NEXT: global_store_short v[20:21], v23, off
+; CHECK-NEXT: v_cvt_f16_f32_e32 v17, v8
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[28:29], v[28:29], v[12:15]
+; CHECK-NEXT: s_nop 2
+; CHECK-NEXT: v_mov_b64_e32 v[12:13], 0
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[28:29], v[28:29], v[0:3]
+; CHECK-NEXT: global_store_short v[12:13], v17, off
; CHECK-NEXT: buffer_wbl2 sc0 sc1
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_inv sc0 sc1
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[10:11], v[8:9], v[4:7]
-; CHECK-NEXT: global_store_short v[20:21], v15, off
+; CHECK-NEXT: v_cvt_f16_f32_e32 v9, v16
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[20:23], v[28:29], v[28:29], v[4:7]
+; CHECK-NEXT: global_store_short v[12:13], v9, off
+; CHECK-NEXT: v_cvt_f16_f32_e32 v1, v8
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[28:29], v[28:29], v[24:27]
; CHECK-NEXT: buffer_wbl2 sc0 sc1
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_inv sc0 sc1
-; CHECK-NEXT: global_store_short v[20:21], v14, off
-; CHECK-NEXT: v_cvt_f16_f32_e32 v14, v16
+; CHECK-NEXT: v_cvt_f16_f32_e32 v14, v0
+; CHECK-NEXT: global_store_short v[12:13], v1, off
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[4:7], v[28:29], v[28:29], v[20:23]
; CHECK-NEXT: buffer_wbl2 sc0 sc1
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_inv sc0 sc1
-; CHECK-NEXT: global_store_short v[20:21], v14, off
-; CHECK-NEXT: v_cvt_f16_f32_e32 v0, v0
+; CHECK-NEXT: global_store_short v[12:13], v14, off
; CHECK-NEXT: buffer_wbl2 sc0 sc1
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_inv sc0 sc1
-; CHECK-NEXT: global_store_short v[20:21], v12, off
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[30:31], v[28:29], v[8:11]
+; CHECK-NEXT: s_nop 6
+; CHECK-NEXT: v_cvt_f16_f32_e32 v8, v0
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], a[0:1], v[28:29], v[4:7]
+; CHECK-NEXT: global_store_short v[12:13], v8, off
; CHECK-NEXT: buffer_wbl2 sc0 sc1
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_inv sc0 sc1
-; CHECK-NEXT: global_store_short v[20:21], v0, off
+; CHECK-NEXT: s_nop 2
+; CHECK-NEXT: v_cvt_f16_f32_e32 v0, v0
+; CHECK-NEXT: global_store_short v[12:13], v0, off
; CHECK-NEXT: s_endpgm
entry:
%k0 = call <4 x float> asm sideeffect "; def $0", "=s"()
@@ -819,32 +812,32 @@ define amdgpu_kernel void @test_rewrite_mfma_direct_copy_from_agpr_class_subreg_
; CHECK-NEXT: ; def a[0:31]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: v_mov_b32_e32 v18, 4.0
-; CHECK-NEXT: v_accvgpr_mov_b32 a17, a16
-; CHECK-NEXT: v_accvgpr_mov_b32 a16, a15
-; CHECK-NEXT: v_accvgpr_mov_b32 a15, a14
-; CHECK-NEXT: v_accvgpr_mov_b32 a14, a13
-; CHECK-NEXT: v_accvgpr_mov_b32 a13, a12
-; CHECK-NEXT: v_accvgpr_mov_b32 a12, a11
-; CHECK-NEXT: v_accvgpr_mov_b32 a11, a10
-; CHECK-NEXT: v_accvgpr_mov_b32 a10, a9
-; CHECK-NEXT: v_accvgpr_mov_b32 a9, a8
-; CHECK-NEXT: v_accvgpr_mov_b32 a8, a7
-; CHECK-NEXT: v_accvgpr_mov_b32 a7, a6
-; CHECK-NEXT: v_accvgpr_mov_b32 a6, a5
-; CHECK-NEXT: v_accvgpr_mov_b32 a5, a4
-; CHECK-NEXT: v_accvgpr_mov_b32 a4, a3
-; CHECK-NEXT: v_accvgpr_mov_b32 a3, a2
-; CHECK-NEXT: v_accvgpr_mov_b32 a2, a1
+; CHECK-NEXT: v_accvgpr_mov_b32 a0, a1
+; CHECK-NEXT: v_accvgpr_mov_b32 a1, a2
+; CHECK-NEXT: v_accvgpr_mov_b32 a2, a3
+; CHECK-NEXT: v_accvgpr_mov_b32 a3, a4
+; CHECK-NEXT: v_accvgpr_mov_b32 a4, a5
+; CHECK-NEXT: v_accvgpr_mov_b32 a5, a6
+; CHECK-NEXT: v_accvgpr_mov_b32 a6, a7
+; CHECK-NEXT: v_accvgpr_mov_b32 a7, a8
+; CHECK-NEXT: v_accvgpr_mov_b32 a8, a9
+; CHECK-NEXT: v_accvgpr_mov_b32 a9, a10
+; CHECK-NEXT: v_accvgpr_mov_b32 a10, a11
+; CHECK-NEXT: v_accvgpr_mov_b32 a11, a12
+; CHECK-NEXT: v_accvgpr_mov_b32 a12, a13
+; CHECK-NEXT: v_accvgpr_mov_b32 a13, a14
+; CHECK-NEXT: v_accvgpr_mov_b32 a14, a15
+; CHECK-NEXT: v_accvgpr_mov_b32 a15, a16
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: v_mfma_f32_16x16x1_4b_f32 a[2:17], v1, v18, a[2:17]
+; CHECK-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v1, v18, a[0:15]
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 6, v0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_nop 7
-; CHECK-NEXT: global_store_dwordx4 v0, a[14:17], s[0:1] offset:48
-; CHECK-NEXT: global_store_dwordx4 v0, a[10:13], s[0:1] offset:32
-; CHECK-NEXT: global_store_dwordx4 v0, a[6:9], s[0:1] offset:16
-; CHECK-NEXT: global_store_dwordx4 v0, a[2:5], s[0:1]
+; CHECK-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; CHECK-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; CHECK-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; CHECK-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; CHECK-NEXT: s_endpgm
%def = call <32 x float> asm sideeffect "; def $0", "=a"()
%src2 = shufflevector <32 x float> %def, <32 x float> poison, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector-physreg-copy.ll b/llvm/test/CodeGen/AMDGPU/shufflevector-physreg-copy.ll
index 4d864ad15b411..3ee558d6f8a9e 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector-physreg-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector-physreg-copy.ll
@@ -576,9 +576,9 @@ define void @shufflevector_v2i32_10_physreg_even_agpr_pair_copy(ptr addrspace(1)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a4, a5
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_mov_b32 a1, a4
-; GFX90A-NEXT: v_accvgpr_mov_b32 a0, a5
-; GFX90A-NEXT: global_store_dwordx2 v0, a[0:1], s[16:17]
+; GFX90A-NEXT: v_accvgpr_read_b32 v3, a4
+; GFX90A-NEXT: v_accvgpr_read_b32 v2, a5
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -590,9 +590,9 @@ define void @shufflevector_v2i32_10_physreg_even_agpr_pair_copy(ptr addrspace(1)
; GFX940-NEXT: ; def a4, a5
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_accvgpr_mov_b32 a1, a4
-; GFX940-NEXT: v_accvgpr_mov_b32 a0, a5
-; GFX940-NEXT: global_store_dwordx2 v0, a[0:1], s[0:1]
+; GFX940-NEXT: v_accvgpr_read_b32 v3, a4
+; GFX940-NEXT: v_accvgpr_read_b32 v2, a5
+; GFX940-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1]
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%asm = call { i32, i32 } asm "; def $0, $1", "={a4},={a5}"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v3f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v3f32.ll
index 34043cd067b25..50cdf11eea2f7 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v3f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v3f32.ll
@@ -413,25 +413,27 @@ define void @v_shuffle_v2f32_v3f32__5_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v2f32_v3f32__5_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: global_store_dwordx2 v3, v[4:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2f32_v3f32__5_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: global_store_dwordx2 v3, v[4:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -672,25 +674,27 @@ define void @v_shuffle_v2f32_v3f32__2_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v2f32_v3f32__2_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: global_store_dwordx2 v3, v[4:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2f32_v3f32__2_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: global_store_dwordx2 v3, v[4:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v3i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v3i32.ll
index f65340470feb1..a6a84c780cb32 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v3i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v3i32.ll
@@ -413,25 +413,27 @@ define void @v_shuffle_v2i32_v3i32__5_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v2i32_v3i32__5_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: global_store_dwordx2 v3, v[4:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2i32_v3i32__5_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: global_store_dwordx2 v3, v[4:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -672,25 +674,27 @@ define void @v_shuffle_v2i32_v3i32__2_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v2i32_v3i32__2_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: global_store_dwordx2 v3, v[4:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2i32_v3i32__2_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: global_store_dwordx2 v3, v[4:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v2i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v2i64.ll
index 51dc9a51ec9d0..0b20caea9cd95 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v2i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v2i64.ll
@@ -291,27 +291,31 @@ define void @v_shuffle_v2i64_v2i64__3_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v2i64_v2i64__3_2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2i64_v2i64__3_2:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: v_mov_b32_e32 v5, v1
-; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v3
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i64> asm "; def $0", "=v"()
@@ -474,27 +478,31 @@ define void @v_shuffle_v2i64_v2i64__1_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v2i64_v2i64__1_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2i64_v2i64__1_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: v_mov_b32_e32 v5, v1
-; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v3
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i64> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v2p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v2p0.ll
index 7f8f2dbbb09a1..2ecbf9622a259 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v2p0.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v2p0.ll
@@ -291,27 +291,31 @@ define void @v_shuffle_v2p0_v2p0__3_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v2p0_v2p0__3_2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2p0_v2p0__3_2:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: v_mov_b32_e32 v5, v1
-; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v3
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr> asm "; def $0", "=v"()
@@ -474,27 +478,31 @@ define void @v_shuffle_v2p0_v2p0__1_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v2p0_v2p0__1_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2p0_v2p0__1_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: v_mov_b32_e32 v5, v1
-; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v3
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v3p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v3p3.ll
index 13e3d94c35446..bacec04ab7600 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v3p3.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v3p3.ll
@@ -413,25 +413,27 @@ define void @v_shuffle_v2p3_v3p3__5_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v2p3_v3p3__5_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: global_store_dwordx2 v3, v[4:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2p3_v3p3__5_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: global_store_dwordx2 v3, v[4:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -672,25 +674,27 @@ define void @v_shuffle_v2p3_v3p3__2_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v2p3_v3p3__2_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: global_store_dwordx2 v3, v[4:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2p3_v3p3__2_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: global_store_dwordx2 v3, v[4:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v2f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v2f32.ll
index 430f64164d24f..fb71492fb867d 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v2f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v2f32.ll
@@ -170,15 +170,15 @@ define void @v_shuffle_v3f32_v2f32__3_0_u(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v1
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -186,15 +186,15 @@ define void @v_shuffle_v3f32_v2f32__3_0_u(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:1]
+; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:3]
+; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v0, v1
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: global_store_dwordx3 v4, v[2:4], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=v"()
@@ -273,27 +273,27 @@ define void @v_shuffle_v3f32_v2f32__3_2_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_2_u:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v1
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v2f32__3_2_u:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:3]
+; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v3
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v1
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=v"()
@@ -469,27 +469,29 @@ define void @v_shuffle_v3f32_v2f32__3_3_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_3_2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
-; GFX90A-NEXT: v_mov_b32_e32 v1, v3
-; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v1
+; GFX90A-NEXT: v_mov_b32_e32 v5, v1
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v2f32__3_3_2:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:3]
+; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v3
-; GFX942-NEXT: v_mov_b32_e32 v1, v3
-; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v1
+; GFX942-NEXT: v_mov_b32_e32 v5, v1
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=v"()
@@ -561,26 +563,27 @@ define void @v_shuffle_v3f32_v2f32__u_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v2f32__u_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
-; GFX90A-NEXT: v_mov_b32_e32 v1, v0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v0
-; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: global_store_dwordx3 v2, v[2:4], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v2f32__u_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
-; GFX942-NEXT: v_mov_b32_e32 v1, v0
-; GFX942-NEXT: v_mov_b32_e32 v2, v0
-; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: global_store_dwordx3 v2, v[2:4], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=v"()
@@ -651,27 +654,29 @@ define void @v_shuffle_v3f32_v2f32__1_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v2f32__1_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v1
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v2f32__1_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:3]
+; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v3
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v1
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=v"()
@@ -696,26 +701,27 @@ define void @v_shuffle_v3f32_v2f32__2_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v2f32__2_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
-; GFX90A-NEXT: v_mov_b32_e32 v1, v0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v0
-; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: global_store_dwordx3 v2, v[2:4], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v2f32__2_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
-; GFX942-NEXT: v_mov_b32_e32 v1, v0
-; GFX942-NEXT: v_mov_b32_e32 v2, v0
-; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: global_store_dwordx3 v2, v[2:4], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=v"()
@@ -744,32 +750,35 @@ define void @v_shuffle_v3f32_v2f32__3_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v1
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v2f32__3_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v0, v1
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=v"()
@@ -799,30 +808,33 @@ define void @v_shuffle_v3f32_v2f32__3_u_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_u_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v2f32__3_u_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=v"()
@@ -852,32 +864,35 @@ define void @v_shuffle_v3f32_v2f32__3_1_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_1_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v1
-; GFX90A-NEXT: v_mov_b32_e32 v1, v3
-; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v2f32__3_1_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v0, v1
-; GFX942-NEXT: v_mov_b32_e32 v1, v3
-; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v1
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=v"()
@@ -907,33 +922,35 @@ define void @v_shuffle_v3f32_v2f32__3_2_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_2_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:5]
+; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v2f32__3_2_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:5]
+; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v5
-; GFX942-NEXT: v_mov_b32_e32 v1, v4
-; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v2
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=v"()
@@ -1245,34 +1262,35 @@ define void @v_shuffle_v3f32_v2f32__3_0_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_0_1:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v1
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v1
+; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v2f32__3_0_1:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:3]
+; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:1]
+; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v1
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v1
+; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=v"()
@@ -1302,34 +1320,35 @@ define void @v_shuffle_v3f32_v2f32__3_2_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_2_1:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:5]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: v_mov_b32_e32 v8, v1
+; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v2f32__3_2_1:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:3]
+; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:5]
+; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: v_mov_b32_e32 v0, v5
-; GFX942-NEXT: v_mov_b32_e32 v1, v4
-; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v2
+; GFX942-NEXT: v_mov_b32_e32 v8, v1
+; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=v"()
@@ -1459,27 +1478,29 @@ define void @v_shuffle_v3f32_v2f32__3_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_2_2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v1
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v2f32__3_2_2:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:3]
+; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v3
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v1
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=v"()
@@ -1506,25 +1527,27 @@ define void @v_shuffle_v3f32_v2f32__3_u_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_u_2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v1, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
-; GFX90A-NEXT: global_store_dwordx3 v1, v[0:2], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v1
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v2f32__3_u_2:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:3]
+; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v3
-; GFX942-NEXT: global_store_dwordx3 v1, v[0:2], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v1
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=v"()
@@ -1553,34 +1576,35 @@ define void @v_shuffle_v3f32_v2f32__3_0_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_0_2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:5]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v2
+; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v2f32__3_0_2:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:3]
+; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:5]
+; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: v_mov_b32_e32 v0, v5
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
-; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v2
+; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=v"()
@@ -1856,26 +1880,27 @@ define void @v_shuffle_v3f32_v2f32__3_u_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_u_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v1
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
-; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v1
+; GFX90A-NEXT: v_mov_b32_e32 v6, v1
+; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v2f32__3_u_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v1
-; GFX942-NEXT: v_mov_b32_e32 v2, v1
-; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v4, v1
+; GFX942-NEXT: v_mov_b32_e32 v6, v1
+; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=v"()
@@ -1905,34 +1930,35 @@ define void @v_shuffle_v3f32_v2f32__3_0_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_0_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:5]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: v_mov_b32_e32 v2, v5
-; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v3
+; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v2f32__3_0_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:3]
+; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:5]
+; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: v_mov_b32_e32 v0, v5
-; GFX942-NEXT: v_mov_b32_e32 v2, v5
-; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v3
+; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=v"()
@@ -2013,28 +2039,29 @@ define void @v_shuffle_v3f32_v2f32__3_2_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_2_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v1
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v1
+; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v2f32__3_2_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:3]
+; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v3
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v4, v1
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v1
+; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v3f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v3f32.ll
index ef670e963bdb6..1ab87d6f19ec4 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v3f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v3f32.ll
@@ -416,25 +416,27 @@ define void @v_shuffle_v3f32_v3f32__5_3_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_3_u:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v3f32__5_3_u:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-NEXT: global_store_dwordx3 v4, v[2:4], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -959,28 +961,29 @@ define void @v_shuffle_v3f32_v3f32__1_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v3f32__1_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v1
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v3f32__1_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
-; GFX942-NEXT: v_mov_b32_e32 v2, v1
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v4, v1
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -1007,27 +1010,29 @@ define void @v_shuffle_v3f32_v3f32__2_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v3f32__2_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v3f32__2_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -1395,13 +1400,14 @@ define void @v_shuffle_v3f32_v3f32__5_3_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1411,13 +1417,14 @@ define void @v_shuffle_v3f32_v3f32__5_3_0(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: v_mov_b32_e32 v5, v2
-; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v7, v2
+; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -2008,13 +2015,14 @@ define void @v_shuffle_v3f32_v3f32__5_3_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_mov_b32_e32 v6, v1
-; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: v_mov_b32_e32 v8, v1
+; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2024,13 +2032,14 @@ define void @v_shuffle_v3f32_v3f32__5_3_1(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, v1
-; GFX942-NEXT: v_mov_b32_e32 v5, v2
-; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v8, v1
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v7, v2
+; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -2859,28 +2868,29 @@ define void @v_shuffle_v3f32_v3f32__4_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v3f32__4_3_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v1
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v3f32__4_3_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
-; GFX942-NEXT: v_mov_b32_e32 v2, v1
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v4, v1
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -2908,27 +2918,29 @@ define void @v_shuffle_v3f32_v3f32__5_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_3_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v3f32__5_3_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -3007,13 +3019,14 @@ define void @v_shuffle_v3f32_v3f32__5_0_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v2
+; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -3023,13 +3036,14 @@ define void @v_shuffle_v3f32_v3f32__5_0_3(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v8, v2
+; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -3167,27 +3181,29 @@ define void @v_shuffle_v3f32_v3f32__5_4_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_4_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v1
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v3f32__5_4_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v3, v1
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v1
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -3610,13 +3626,14 @@ define void @v_shuffle_v3f32_v3f32__5_0_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v3
+; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -3626,13 +3643,14 @@ define void @v_shuffle_v3f32_v3f32__5_0_4(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v8, v3
+; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -3773,27 +3791,29 @@ define void @v_shuffle_v3f32_v3f32__5_3_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_3_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v1
-; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v1
+; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v3f32__5_3_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-NEXT: v_mov_b32_e32 v4, v1
-; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v1
+; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v4f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v4f32.ll
index 50c69de069986..c5a08f098b4c6 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v4f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v4f32.ll
@@ -965,26 +965,29 @@ define void @v_shuffle_v3f32_v4f32__7_7_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_7_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v4f32__7_7_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v3
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -1011,26 +1014,29 @@ define void @v_shuffle_v3f32_v4f32__7_7_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_7_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: v_mov_b32_e32 v4, v1
-; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_mov_b32_e32 v8, v1
+; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v4f32__7_7_5:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: v_mov_b32_e32 v4, v1
-; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v3
+; GFX942-NEXT: v_mov_b32_e32 v8, v1
+; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -1241,28 +1247,29 @@ define void @v_shuffle_v3f32_v4f32__1_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v4f32__1_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v1
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v4f32__1_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
-; GFX942-NEXT: v_mov_b32_e32 v2, v1
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v1
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -1289,26 +1296,29 @@ define void @v_shuffle_v3f32_v4f32__2_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v4f32__2_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v4f32__2_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -1335,28 +1345,29 @@ define void @v_shuffle_v3f32_v4f32__3_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v4f32__3_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v4f32__3_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -1838,14 +1849,14 @@ define void @v_shuffle_v3f32_v4f32__7_4_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v5
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, v5
+; GFX90A-NEXT: v_mov_b32_e32 v9, v2
+; GFX90A-NEXT: v_mov_b32_e32 v10, v0
+; GFX90A-NEXT: global_store_dwordx3 v6, v[8:10], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1855,14 +1866,14 @@ define void @v_shuffle_v3f32_v4f32__7_4_0(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: v_mov_b32_e32 v4, v5
-; GFX942-NEXT: v_mov_b32_e32 v5, v2
-; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v10, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v5
+; GFX942-NEXT: v_mov_b32_e32 v9, v2
+; GFX942-NEXT: global_store_dwordx3 v6, v[8:10], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -2667,14 +2678,14 @@ define void @v_shuffle_v3f32_v4f32__7_4_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v5
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_mov_b32_e32 v6, v1
-; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, v5
+; GFX90A-NEXT: v_mov_b32_e32 v9, v2
+; GFX90A-NEXT: v_mov_b32_e32 v10, v1
+; GFX90A-NEXT: global_store_dwordx3 v6, v[8:10], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2684,14 +2695,14 @@ define void @v_shuffle_v3f32_v4f32__7_4_1(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, v1
-; GFX942-NEXT: v_mov_b32_e32 v4, v5
-; GFX942-NEXT: v_mov_b32_e32 v5, v2
-; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v10, v1
+; GFX942-NEXT: v_mov_b32_e32 v8, v5
+; GFX942-NEXT: v_mov_b32_e32 v9, v2
+; GFX942-NEXT: global_store_dwordx3 v6, v[8:10], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -4664,28 +4675,29 @@ define void @v_shuffle_v3f32_v4f32__5_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v4f32__5_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v1
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v4f32__5_4_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
-; GFX942-NEXT: v_mov_b32_e32 v2, v1
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v1
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -4713,26 +4725,29 @@ define void @v_shuffle_v3f32_v4f32__6_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v4f32__6_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v4f32__6_4_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -4760,28 +4775,29 @@ define void @v_shuffle_v3f32_v4f32__7_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v4f32__7_4_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -4808,26 +4824,27 @@ define void @v_shuffle_v3f32_v4f32__7_u_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_u_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v4f32__7_u_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -4861,14 +4878,14 @@ define void @v_shuffle_v3f32_v4f32__7_0_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v5
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, v5
+; GFX90A-NEXT: v_mov_b32_e32 v9, v0
+; GFX90A-NEXT: v_mov_b32_e32 v10, v2
+; GFX90A-NEXT: global_store_dwordx3 v6, v[8:10], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -4878,15 +4895,14 @@ define void @v_shuffle_v3f32_v4f32__7_0_4(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v5
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v9, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v5
+; GFX942-NEXT: v_mov_b32_e32 v10, v2
+; GFX942-NEXT: global_store_dwordx3 v6, v[8:10], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -5081,28 +5097,29 @@ define void @v_shuffle_v3f32_v4f32__7_5_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_5_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v4f32__7_5_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: v_mov_b32_e32 v3, v1
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v1
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -5129,29 +5146,29 @@ define void @v_shuffle_v3f32_v4f32__7_6_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_6_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v3
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v4f32__7_6_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v3
-; GFX942-NEXT: v_mov_b32_e32 v5, v2
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v2
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -5675,14 +5692,14 @@ define void @v_shuffle_v3f32_v4f32__7_0_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v5
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, v5
+; GFX90A-NEXT: v_mov_b32_e32 v9, v0
+; GFX90A-NEXT: v_mov_b32_e32 v10, v3
+; GFX90A-NEXT: global_store_dwordx3 v6, v[8:10], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -5692,15 +5709,14 @@ define void @v_shuffle_v3f32_v4f32__7_0_5(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v5
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v9, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v5
+; GFX942-NEXT: v_mov_b32_e32 v10, v3
+; GFX942-NEXT: global_store_dwordx3 v6, v[8:10], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -5899,28 +5915,29 @@ define void @v_shuffle_v3f32_v4f32__7_4_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_4_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v1
-; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v1
+; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v4f32__7_4_5:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-NEXT: v_mov_b32_e32 v4, v1
-; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v1
+; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -5947,29 +5964,29 @@ define void @v_shuffle_v3f32_v4f32__7_6_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_6_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v3
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_mov_b32_e32 v6, v1
-; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: v_mov_b32_e32 v8, v1
+; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v4f32__7_6_5:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v3
-; GFX942-NEXT: v_mov_b32_e32 v5, v2
-; GFX942-NEXT: v_mov_b32_e32 v6, v1
-; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v2
+; GFX942-NEXT: v_mov_b32_e32 v8, v1
+; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -6706,29 +6723,29 @@ define void @v_shuffle_v3f32_v4f32__7_4_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_4_6:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v3
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v2
+; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v4f32__7_4_6:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v3
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v2
+; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -7484,29 +7501,29 @@ define void @v_shuffle_v3f32_v4f32__7_4_7(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_4_7:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v3
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v3
+; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v4f32__7_4_7:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v3
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v3
+; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v2i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v2i32.ll
index ea4fac3b1d2b1..91790ab5ff97f 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v2i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v2i32.ll
@@ -170,15 +170,15 @@ define void @v_shuffle_v3i32_v2i32__3_0_u(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v1
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -186,15 +186,15 @@ define void @v_shuffle_v3i32_v2i32__3_0_u(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:1]
+; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:3]
+; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v0, v1
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: global_store_dwordx3 v4, v[2:4], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i32> asm "; def $0", "=v"()
@@ -273,27 +273,27 @@ define void @v_shuffle_v3i32_v2i32__3_2_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_2_u:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v1
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v2i32__3_2_u:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:3]
+; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v3
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v1
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i32> asm "; def $0", "=v"()
@@ -469,27 +469,29 @@ define void @v_shuffle_v3i32_v2i32__3_3_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_3_2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
-; GFX90A-NEXT: v_mov_b32_e32 v1, v3
-; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v1
+; GFX90A-NEXT: v_mov_b32_e32 v5, v1
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v2i32__3_3_2:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:3]
+; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v3
-; GFX942-NEXT: v_mov_b32_e32 v1, v3
-; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v1
+; GFX942-NEXT: v_mov_b32_e32 v5, v1
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i32> asm "; def $0", "=v"()
@@ -561,26 +563,27 @@ define void @v_shuffle_v3i32_v2i32__u_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v2i32__u_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
-; GFX90A-NEXT: v_mov_b32_e32 v1, v0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v0
-; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: global_store_dwordx3 v2, v[2:4], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v2i32__u_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
-; GFX942-NEXT: v_mov_b32_e32 v1, v0
-; GFX942-NEXT: v_mov_b32_e32 v2, v0
-; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: global_store_dwordx3 v2, v[2:4], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i32> asm "; def $0", "=v"()
@@ -651,27 +654,29 @@ define void @v_shuffle_v3i32_v2i32__1_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v2i32__1_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v1
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v2i32__1_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:3]
+; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v3
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v1
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i32> asm "; def $0", "=v"()
@@ -696,26 +701,27 @@ define void @v_shuffle_v3i32_v2i32__2_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v2i32__2_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
-; GFX90A-NEXT: v_mov_b32_e32 v1, v0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v0
-; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: global_store_dwordx3 v2, v[2:4], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v2i32__2_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
-; GFX942-NEXT: v_mov_b32_e32 v1, v0
-; GFX942-NEXT: v_mov_b32_e32 v2, v0
-; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: global_store_dwordx3 v2, v[2:4], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i32> asm "; def $0", "=v"()
@@ -744,32 +750,35 @@ define void @v_shuffle_v3i32_v2i32__3_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v1
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v2i32__3_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v0, v1
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i32> asm "; def $0", "=v"()
@@ -799,30 +808,33 @@ define void @v_shuffle_v3i32_v2i32__3_u_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_u_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v2i32__3_u_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i32> asm "; def $0", "=v"()
@@ -852,32 +864,35 @@ define void @v_shuffle_v3i32_v2i32__3_1_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_1_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v1
-; GFX90A-NEXT: v_mov_b32_e32 v1, v3
-; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v2i32__3_1_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v0, v1
-; GFX942-NEXT: v_mov_b32_e32 v1, v3
-; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v1
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i32> asm "; def $0", "=v"()
@@ -907,33 +922,35 @@ define void @v_shuffle_v3i32_v2i32__3_2_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_2_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:5]
+; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v2i32__3_2_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:5]
+; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v5
-; GFX942-NEXT: v_mov_b32_e32 v1, v4
-; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v2
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i32> asm "; def $0", "=v"()
@@ -1245,34 +1262,35 @@ define void @v_shuffle_v3i32_v2i32__3_0_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_0_1:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v1
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v1
+; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v2i32__3_0_1:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:3]
+; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:1]
+; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v1
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v1
+; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i32> asm "; def $0", "=v"()
@@ -1302,34 +1320,35 @@ define void @v_shuffle_v3i32_v2i32__3_2_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_2_1:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:5]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: v_mov_b32_e32 v8, v1
+; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v2i32__3_2_1:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:3]
+; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:5]
+; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: v_mov_b32_e32 v0, v5
-; GFX942-NEXT: v_mov_b32_e32 v1, v4
-; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v2
+; GFX942-NEXT: v_mov_b32_e32 v8, v1
+; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i32> asm "; def $0", "=v"()
@@ -1459,27 +1478,29 @@ define void @v_shuffle_v3i32_v2i32__3_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_2_2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v1
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v2i32__3_2_2:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:3]
+; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v3
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v1
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i32> asm "; def $0", "=v"()
@@ -1506,25 +1527,27 @@ define void @v_shuffle_v3i32_v2i32__3_u_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_u_2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v1, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
-; GFX90A-NEXT: global_store_dwordx3 v1, v[0:2], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v1
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v2i32__3_u_2:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:3]
+; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v3
-; GFX942-NEXT: global_store_dwordx3 v1, v[0:2], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v1
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i32> asm "; def $0", "=v"()
@@ -1553,34 +1576,35 @@ define void @v_shuffle_v3i32_v2i32__3_0_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_0_2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:5]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v2
+; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v2i32__3_0_2:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:3]
+; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:5]
+; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: v_mov_b32_e32 v0, v5
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
-; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v2
+; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i32> asm "; def $0", "=v"()
@@ -1856,26 +1880,27 @@ define void @v_shuffle_v3i32_v2i32__3_u_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_u_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v1
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
-; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v1
+; GFX90A-NEXT: v_mov_b32_e32 v6, v1
+; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v2i32__3_u_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v1
-; GFX942-NEXT: v_mov_b32_e32 v2, v1
-; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v4, v1
+; GFX942-NEXT: v_mov_b32_e32 v6, v1
+; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i32> asm "; def $0", "=v"()
@@ -1905,34 +1930,35 @@ define void @v_shuffle_v3i32_v2i32__3_0_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_0_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:5]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: v_mov_b32_e32 v2, v5
-; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v3
+; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v2i32__3_0_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:3]
+; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:5]
+; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: v_mov_b32_e32 v0, v5
-; GFX942-NEXT: v_mov_b32_e32 v2, v5
-; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v3
+; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i32> asm "; def $0", "=v"()
@@ -2013,28 +2039,29 @@ define void @v_shuffle_v3i32_v2i32__3_2_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_2_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v1
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v1
+; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v2i32__3_2_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:3]
+; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v3
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v4, v1
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v1
+; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i32> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v3i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v3i32.ll
index 7061c13b28d03..db780ced25148 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v3i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v3i32.ll
@@ -416,25 +416,27 @@ define void @v_shuffle_v3i32_v3i32__5_3_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_3_u:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v3i32__5_3_u:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-NEXT: global_store_dwordx3 v4, v[2:4], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -959,28 +961,29 @@ define void @v_shuffle_v3i32_v3i32__1_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v3i32__1_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v1
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v3i32__1_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
-; GFX942-NEXT: v_mov_b32_e32 v2, v1
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v4, v1
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -1007,27 +1010,29 @@ define void @v_shuffle_v3i32_v3i32__2_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v3i32__2_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v3i32__2_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -1395,13 +1400,14 @@ define void @v_shuffle_v3i32_v3i32__5_3_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1411,13 +1417,14 @@ define void @v_shuffle_v3i32_v3i32__5_3_0(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: v_mov_b32_e32 v5, v2
-; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v7, v2
+; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -2008,13 +2015,14 @@ define void @v_shuffle_v3i32_v3i32__5_3_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_mov_b32_e32 v6, v1
-; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: v_mov_b32_e32 v8, v1
+; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2024,13 +2032,14 @@ define void @v_shuffle_v3i32_v3i32__5_3_1(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, v1
-; GFX942-NEXT: v_mov_b32_e32 v5, v2
-; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v8, v1
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v7, v2
+; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -2859,28 +2868,29 @@ define void @v_shuffle_v3i32_v3i32__4_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v3i32__4_3_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v1
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v3i32__4_3_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
-; GFX942-NEXT: v_mov_b32_e32 v2, v1
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v4, v1
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -2908,27 +2918,29 @@ define void @v_shuffle_v3i32_v3i32__5_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_3_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v3i32__5_3_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -3007,13 +3019,14 @@ define void @v_shuffle_v3i32_v3i32__5_0_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v2
+; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -3023,13 +3036,14 @@ define void @v_shuffle_v3i32_v3i32__5_0_3(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v8, v2
+; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -3167,27 +3181,29 @@ define void @v_shuffle_v3i32_v3i32__5_4_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_4_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v1
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v3i32__5_4_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v3, v1
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v1
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -3610,13 +3626,14 @@ define void @v_shuffle_v3i32_v3i32__5_0_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v3
+; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -3626,13 +3643,14 @@ define void @v_shuffle_v3i32_v3i32__5_0_4(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v8, v3
+; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -3773,27 +3791,29 @@ define void @v_shuffle_v3i32_v3i32__5_3_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_3_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v1
-; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v1
+; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v3i32__5_3_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-NEXT: v_mov_b32_e32 v4, v1
-; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v1
+; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v4i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v4i32.ll
index 11d1897d0449f..92d6c95c26599 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v4i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v4i32.ll
@@ -965,26 +965,29 @@ define void @v_shuffle_v3i32_v4i32__7_7_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_7_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v4i32__7_7_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v3
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -1011,26 +1014,29 @@ define void @v_shuffle_v3i32_v4i32__7_7_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_7_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: v_mov_b32_e32 v4, v1
-; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_mov_b32_e32 v8, v1
+; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v4i32__7_7_5:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: v_mov_b32_e32 v4, v1
-; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v3
+; GFX942-NEXT: v_mov_b32_e32 v8, v1
+; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -1241,28 +1247,29 @@ define void @v_shuffle_v3i32_v4i32__1_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v4i32__1_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v1
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v4i32__1_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
-; GFX942-NEXT: v_mov_b32_e32 v2, v1
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v1
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -1289,26 +1296,29 @@ define void @v_shuffle_v3i32_v4i32__2_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v4i32__2_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v4i32__2_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -1335,28 +1345,29 @@ define void @v_shuffle_v3i32_v4i32__3_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v4i32__3_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v4i32__3_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -1838,14 +1849,14 @@ define void @v_shuffle_v3i32_v4i32__7_4_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v5
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, v5
+; GFX90A-NEXT: v_mov_b32_e32 v9, v2
+; GFX90A-NEXT: v_mov_b32_e32 v10, v0
+; GFX90A-NEXT: global_store_dwordx3 v6, v[8:10], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1855,14 +1866,14 @@ define void @v_shuffle_v3i32_v4i32__7_4_0(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: v_mov_b32_e32 v4, v5
-; GFX942-NEXT: v_mov_b32_e32 v5, v2
-; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v10, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v5
+; GFX942-NEXT: v_mov_b32_e32 v9, v2
+; GFX942-NEXT: global_store_dwordx3 v6, v[8:10], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -2667,14 +2678,14 @@ define void @v_shuffle_v3i32_v4i32__7_4_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v5
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_mov_b32_e32 v6, v1
-; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, v5
+; GFX90A-NEXT: v_mov_b32_e32 v9, v2
+; GFX90A-NEXT: v_mov_b32_e32 v10, v1
+; GFX90A-NEXT: global_store_dwordx3 v6, v[8:10], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2684,14 +2695,14 @@ define void @v_shuffle_v3i32_v4i32__7_4_1(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, v1
-; GFX942-NEXT: v_mov_b32_e32 v4, v5
-; GFX942-NEXT: v_mov_b32_e32 v5, v2
-; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v10, v1
+; GFX942-NEXT: v_mov_b32_e32 v8, v5
+; GFX942-NEXT: v_mov_b32_e32 v9, v2
+; GFX942-NEXT: global_store_dwordx3 v6, v[8:10], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -4664,28 +4675,29 @@ define void @v_shuffle_v3i32_v4i32__5_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v4i32__5_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v1
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v4i32__5_4_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
-; GFX942-NEXT: v_mov_b32_e32 v2, v1
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v1
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -4713,26 +4725,29 @@ define void @v_shuffle_v3i32_v4i32__6_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v4i32__6_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v4i32__6_4_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -4760,28 +4775,29 @@ define void @v_shuffle_v3i32_v4i32__7_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v4i32__7_4_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -4808,26 +4824,27 @@ define void @v_shuffle_v3i32_v4i32__7_u_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_u_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v4i32__7_u_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -4861,14 +4878,14 @@ define void @v_shuffle_v3i32_v4i32__7_0_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v5
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, v5
+; GFX90A-NEXT: v_mov_b32_e32 v9, v0
+; GFX90A-NEXT: v_mov_b32_e32 v10, v2
+; GFX90A-NEXT: global_store_dwordx3 v6, v[8:10], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -4878,15 +4895,14 @@ define void @v_shuffle_v3i32_v4i32__7_0_4(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v5
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v9, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v5
+; GFX942-NEXT: v_mov_b32_e32 v10, v2
+; GFX942-NEXT: global_store_dwordx3 v6, v[8:10], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -5081,28 +5097,29 @@ define void @v_shuffle_v3i32_v4i32__7_5_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_5_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v4i32__7_5_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: v_mov_b32_e32 v3, v1
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v1
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -5129,29 +5146,29 @@ define void @v_shuffle_v3i32_v4i32__7_6_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_6_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v3
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v4i32__7_6_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v3
-; GFX942-NEXT: v_mov_b32_e32 v5, v2
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v2
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -5675,14 +5692,14 @@ define void @v_shuffle_v3i32_v4i32__7_0_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v5
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, v5
+; GFX90A-NEXT: v_mov_b32_e32 v9, v0
+; GFX90A-NEXT: v_mov_b32_e32 v10, v3
+; GFX90A-NEXT: global_store_dwordx3 v6, v[8:10], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -5692,15 +5709,14 @@ define void @v_shuffle_v3i32_v4i32__7_0_5(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v5
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v9, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v5
+; GFX942-NEXT: v_mov_b32_e32 v10, v3
+; GFX942-NEXT: global_store_dwordx3 v6, v[8:10], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -5899,28 +5915,29 @@ define void @v_shuffle_v3i32_v4i32__7_4_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_4_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v1
-; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v1
+; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v4i32__7_4_5:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-NEXT: v_mov_b32_e32 v4, v1
-; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v1
+; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -5947,29 +5964,29 @@ define void @v_shuffle_v3i32_v4i32__7_6_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_6_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v3
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_mov_b32_e32 v6, v1
-; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: v_mov_b32_e32 v8, v1
+; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v4i32__7_6_5:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v3
-; GFX942-NEXT: v_mov_b32_e32 v5, v2
-; GFX942-NEXT: v_mov_b32_e32 v6, v1
-; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v2
+; GFX942-NEXT: v_mov_b32_e32 v8, v1
+; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -6706,29 +6723,29 @@ define void @v_shuffle_v3i32_v4i32__7_4_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_4_6:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v3
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v2
+; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v4i32__7_4_6:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v3
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v2
+; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -7484,29 +7501,29 @@ define void @v_shuffle_v3i32_v4i32__7_4_7(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_4_7:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v3
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v3
+; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v4i32__7_4_7:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v3
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v3
+; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v2i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v2i64.ll
index a15fc3212f474..bbca5039bb02c 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v2i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v2i64.ll
@@ -291,27 +291,31 @@ define void @v_shuffle_v3i64_v2i64__3_2_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i64_v2i64__3_2_u:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i64_v2i64__3_2_u:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: v_mov_b32_e32 v5, v1
-; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v3
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i64> asm "; def $0", "=v"()
@@ -695,28 +699,32 @@ define void @v_shuffle_v3i64_v2i64__1_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i64_v2i64__1_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
+; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i64_v2i64__1_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: v_mov_b32_e32 v5, v1
-; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
+; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:16
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v3
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i64> asm "; def $0", "=v"()
@@ -1555,28 +1563,32 @@ define void @v_shuffle_v3i64_v2i64__3_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i64_v2i64__3_2_2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
+; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i64_v2i64__3_2_2:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: v_mov_b32_e32 v5, v1
-; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
+; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:16
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v3
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i64> asm "; def $0", "=v"()
@@ -2148,28 +2160,32 @@ define void @v_shuffle_v3i64_v2i64__3_2_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i64_v2i64__3_2_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
+; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i64_v2i64__3_2_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] offset:16
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: v_mov_b32_e32 v5, v1
-; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
+; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] offset:16
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v3
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i64> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v2p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v2p0.ll
index fe132493ce536..8757639c501d2 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v2p0.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v2p0.ll
@@ -291,27 +291,31 @@ define void @v_shuffle_v3p0_v2p0__3_2_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p0_v2p0__3_2_u:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p0_v2p0__3_2_u:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: v_mov_b32_e32 v5, v1
-; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v3
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr> asm "; def $0", "=v"()
@@ -695,28 +699,32 @@ define void @v_shuffle_v3p0_v2p0__1_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p0_v2p0__1_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
+; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p0_v2p0__1_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: v_mov_b32_e32 v5, v1
-; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
+; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:16
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v3
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr> asm "; def $0", "=v"()
@@ -1555,28 +1563,32 @@ define void @v_shuffle_v3p0_v2p0__3_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p0_v2p0__3_2_2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
+; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p0_v2p0__3_2_2:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: v_mov_b32_e32 v5, v1
-; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
+; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:16
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v3
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr> asm "; def $0", "=v"()
@@ -2148,28 +2160,32 @@ define void @v_shuffle_v3p0_v2p0__3_2_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p0_v2p0__3_2_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
+; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p0_v2p0__3_2_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] offset:16
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: v_mov_b32_e32 v5, v1
-; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
+; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] offset:16
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v3
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v2p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v2p3.ll
index bd0100a4ffdb5..6d294b58ceeec 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v2p3.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v2p3.ll
@@ -170,15 +170,15 @@ define void @v_shuffle_v3p3_v2p3__3_0_u(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v1
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -186,15 +186,15 @@ define void @v_shuffle_v3p3_v2p3__3_0_u(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:1]
+; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:3]
+; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v0, v1
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: global_store_dwordx3 v4, v[2:4], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -273,27 +273,27 @@ define void @v_shuffle_v3p3_v2p3__3_2_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_2_u:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v1
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v2p3__3_2_u:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:3]
+; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v3
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v1
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -469,27 +469,29 @@ define void @v_shuffle_v3p3_v2p3__3_3_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_3_2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
-; GFX90A-NEXT: v_mov_b32_e32 v1, v3
-; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v1
+; GFX90A-NEXT: v_mov_b32_e32 v5, v1
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v2p3__3_3_2:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:3]
+; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v3
-; GFX942-NEXT: v_mov_b32_e32 v1, v3
-; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v1
+; GFX942-NEXT: v_mov_b32_e32 v5, v1
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -561,26 +563,27 @@ define void @v_shuffle_v3p3_v2p3__u_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v2p3__u_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
-; GFX90A-NEXT: v_mov_b32_e32 v1, v0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v0
-; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: global_store_dwordx3 v2, v[2:4], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v2p3__u_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
-; GFX942-NEXT: v_mov_b32_e32 v1, v0
-; GFX942-NEXT: v_mov_b32_e32 v2, v0
-; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: global_store_dwordx3 v2, v[2:4], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -651,27 +654,29 @@ define void @v_shuffle_v3p3_v2p3__1_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v2p3__1_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v1
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v2p3__1_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:3]
+; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v3
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v1
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -696,26 +701,27 @@ define void @v_shuffle_v3p3_v2p3__2_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v2p3__2_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
-; GFX90A-NEXT: v_mov_b32_e32 v1, v0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v0
-; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: global_store_dwordx3 v2, v[2:4], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v2p3__2_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
-; GFX942-NEXT: v_mov_b32_e32 v1, v0
-; GFX942-NEXT: v_mov_b32_e32 v2, v0
-; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: global_store_dwordx3 v2, v[2:4], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -744,32 +750,35 @@ define void @v_shuffle_v3p3_v2p3__3_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v1
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v2p3__3_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v0, v1
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -799,30 +808,33 @@ define void @v_shuffle_v3p3_v2p3__3_u_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_u_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v2p3__3_u_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -852,32 +864,35 @@ define void @v_shuffle_v3p3_v2p3__3_1_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_1_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v1
-; GFX90A-NEXT: v_mov_b32_e32 v1, v3
-; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v2p3__3_1_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v0, v1
-; GFX942-NEXT: v_mov_b32_e32 v1, v3
-; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v1
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -907,33 +922,35 @@ define void @v_shuffle_v3p3_v2p3__3_2_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_2_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:5]
+; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v2p3__3_2_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:5]
+; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v5
-; GFX942-NEXT: v_mov_b32_e32 v1, v4
-; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v2
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -1245,34 +1262,35 @@ define void @v_shuffle_v3p3_v2p3__3_0_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_0_1:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v1
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v1
+; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v2p3__3_0_1:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:3]
+; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:1]
+; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v1
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v1
+; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -1302,34 +1320,35 @@ define void @v_shuffle_v3p3_v2p3__3_2_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_2_1:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:5]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: v_mov_b32_e32 v8, v1
+; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v2p3__3_2_1:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:3]
+; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:5]
+; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: v_mov_b32_e32 v0, v5
-; GFX942-NEXT: v_mov_b32_e32 v1, v4
-; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v2
+; GFX942-NEXT: v_mov_b32_e32 v8, v1
+; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -1459,27 +1478,29 @@ define void @v_shuffle_v3p3_v2p3__3_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_2_2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v1
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v2p3__3_2_2:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:3]
+; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v3
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v1
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -1506,25 +1527,27 @@ define void @v_shuffle_v3p3_v2p3__3_u_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_u_2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v1, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
-; GFX90A-NEXT: global_store_dwordx3 v1, v[0:2], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v1
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v2p3__3_u_2:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:3]
+; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v3
-; GFX942-NEXT: global_store_dwordx3 v1, v[0:2], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v1
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -1553,34 +1576,35 @@ define void @v_shuffle_v3p3_v2p3__3_0_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_0_2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:5]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v2
+; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v2p3__3_0_2:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:3]
+; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:5]
+; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: v_mov_b32_e32 v0, v5
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
-; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v2
+; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -1856,26 +1880,27 @@ define void @v_shuffle_v3p3_v2p3__3_u_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_u_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v1
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
-; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v1
+; GFX90A-NEXT: v_mov_b32_e32 v6, v1
+; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v2p3__3_u_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v1
-; GFX942-NEXT: v_mov_b32_e32 v2, v1
-; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v4, v1
+; GFX942-NEXT: v_mov_b32_e32 v6, v1
+; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -1905,34 +1930,35 @@ define void @v_shuffle_v3p3_v2p3__3_0_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_0_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:5]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: v_mov_b32_e32 v2, v5
-; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v3
+; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v2p3__3_0_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:3]
+; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:5]
+; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: v_mov_b32_e32 v0, v5
-; GFX942-NEXT: v_mov_b32_e32 v2, v5
-; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v3
+; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -2013,28 +2039,29 @@ define void @v_shuffle_v3p3_v2p3__3_2_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_2_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v1
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v1
+; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v2p3__3_2_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:3]
+; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v3
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v4, v1
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v1
+; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v3p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v3p3.ll
index cecd2a0e4b015..88d43df5938ee 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v3p3.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v3p3.ll
@@ -416,25 +416,27 @@ define void @v_shuffle_v3p3_v3p3__5_3_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_3_u:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v3p3__5_3_u:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-NEXT: global_store_dwordx3 v4, v[2:4], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -959,28 +961,29 @@ define void @v_shuffle_v3p3_v3p3__1_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v3p3__1_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v1
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v3p3__1_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
-; GFX942-NEXT: v_mov_b32_e32 v2, v1
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v4, v1
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -1007,27 +1010,29 @@ define void @v_shuffle_v3p3_v3p3__2_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v3p3__2_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v3p3__2_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -1395,13 +1400,14 @@ define void @v_shuffle_v3p3_v3p3__5_3_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1411,13 +1417,14 @@ define void @v_shuffle_v3p3_v3p3__5_3_0(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: v_mov_b32_e32 v5, v2
-; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v7, v2
+; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -2008,13 +2015,14 @@ define void @v_shuffle_v3p3_v3p3__5_3_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_mov_b32_e32 v6, v1
-; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: v_mov_b32_e32 v8, v1
+; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2024,13 +2032,14 @@ define void @v_shuffle_v3p3_v3p3__5_3_1(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, v1
-; GFX942-NEXT: v_mov_b32_e32 v5, v2
-; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v8, v1
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v7, v2
+; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -2859,28 +2868,29 @@ define void @v_shuffle_v3p3_v3p3__4_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v3p3__4_3_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v1
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v3p3__4_3_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
-; GFX942-NEXT: v_mov_b32_e32 v2, v1
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v4, v1
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -2908,27 +2918,29 @@ define void @v_shuffle_v3p3_v3p3__5_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_3_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v3p3__5_3_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -3007,13 +3019,14 @@ define void @v_shuffle_v3p3_v3p3__5_0_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v2
+; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -3023,13 +3036,14 @@ define void @v_shuffle_v3p3_v3p3__5_0_3(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v8, v2
+; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -3167,27 +3181,29 @@ define void @v_shuffle_v3p3_v3p3__5_4_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_4_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v1
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v3p3__5_4_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v3, v1
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v1
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -3610,13 +3626,14 @@ define void @v_shuffle_v3p3_v3p3__5_0_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v3
+; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -3626,13 +3643,14 @@ define void @v_shuffle_v3p3_v3p3__5_0_4(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v8, v3
+; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -3773,27 +3791,29 @@ define void @v_shuffle_v3p3_v3p3__5_3_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_3_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v1
-; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v1
+; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v3p3__5_3_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-NEXT: v_mov_b32_e32 v4, v1
-; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v1
+; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v4p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v4p3.ll
index 834f03f013ba1..c9f194d873e35 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v4p3.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v4p3.ll
@@ -965,26 +965,29 @@ define void @v_shuffle_v3p3_v4p3__7_7_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_7_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v4p3__7_7_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v3
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -1011,26 +1014,29 @@ define void @v_shuffle_v3p3_v4p3__7_7_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_7_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: v_mov_b32_e32 v4, v1
-; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_mov_b32_e32 v8, v1
+; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v4p3__7_7_5:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: v_mov_b32_e32 v4, v1
-; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v3
+; GFX942-NEXT: v_mov_b32_e32 v8, v1
+; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -1241,28 +1247,29 @@ define void @v_shuffle_v3p3_v4p3__1_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v4p3__1_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v1
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v4p3__1_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
-; GFX942-NEXT: v_mov_b32_e32 v2, v1
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v1
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -1289,26 +1296,29 @@ define void @v_shuffle_v3p3_v4p3__2_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v4p3__2_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v4p3__2_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -1335,28 +1345,29 @@ define void @v_shuffle_v3p3_v4p3__3_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v4p3__3_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v4p3__3_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -1838,14 +1849,14 @@ define void @v_shuffle_v3p3_v4p3__7_4_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v5
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, v5
+; GFX90A-NEXT: v_mov_b32_e32 v9, v2
+; GFX90A-NEXT: v_mov_b32_e32 v10, v0
+; GFX90A-NEXT: global_store_dwordx3 v6, v[8:10], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1855,14 +1866,14 @@ define void @v_shuffle_v3p3_v4p3__7_4_0(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: v_mov_b32_e32 v4, v5
-; GFX942-NEXT: v_mov_b32_e32 v5, v2
-; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v10, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v5
+; GFX942-NEXT: v_mov_b32_e32 v9, v2
+; GFX942-NEXT: global_store_dwordx3 v6, v[8:10], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -2667,14 +2678,14 @@ define void @v_shuffle_v3p3_v4p3__7_4_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v5
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_mov_b32_e32 v6, v1
-; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, v5
+; GFX90A-NEXT: v_mov_b32_e32 v9, v2
+; GFX90A-NEXT: v_mov_b32_e32 v10, v1
+; GFX90A-NEXT: global_store_dwordx3 v6, v[8:10], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2684,14 +2695,14 @@ define void @v_shuffle_v3p3_v4p3__7_4_1(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, v1
-; GFX942-NEXT: v_mov_b32_e32 v4, v5
-; GFX942-NEXT: v_mov_b32_e32 v5, v2
-; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v10, v1
+; GFX942-NEXT: v_mov_b32_e32 v8, v5
+; GFX942-NEXT: v_mov_b32_e32 v9, v2
+; GFX942-NEXT: global_store_dwordx3 v6, v[8:10], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -4664,28 +4675,29 @@ define void @v_shuffle_v3p3_v4p3__5_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v4p3__5_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v1
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v4p3__5_4_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
-; GFX942-NEXT: v_mov_b32_e32 v2, v1
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v1
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -4713,26 +4725,29 @@ define void @v_shuffle_v3p3_v4p3__6_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v4p3__6_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v4p3__6_4_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -4760,28 +4775,29 @@ define void @v_shuffle_v3p3_v4p3__7_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v4p3__7_4_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -4808,26 +4824,27 @@ define void @v_shuffle_v3p3_v4p3__7_u_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_u_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v4p3__7_u_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -4861,14 +4878,14 @@ define void @v_shuffle_v3p3_v4p3__7_0_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v5
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, v5
+; GFX90A-NEXT: v_mov_b32_e32 v9, v0
+; GFX90A-NEXT: v_mov_b32_e32 v10, v2
+; GFX90A-NEXT: global_store_dwordx3 v6, v[8:10], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -4878,15 +4895,14 @@ define void @v_shuffle_v3p3_v4p3__7_0_4(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v5
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v9, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v5
+; GFX942-NEXT: v_mov_b32_e32 v10, v2
+; GFX942-NEXT: global_store_dwordx3 v6, v[8:10], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -5081,28 +5097,29 @@ define void @v_shuffle_v3p3_v4p3__7_5_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_5_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v4p3__7_5_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: v_mov_b32_e32 v3, v1
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v1
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -5129,29 +5146,29 @@ define void @v_shuffle_v3p3_v4p3__7_6_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_6_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v3
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v4p3__7_6_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v3
-; GFX942-NEXT: v_mov_b32_e32 v5, v2
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v2
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -5675,14 +5692,14 @@ define void @v_shuffle_v3p3_v4p3__7_0_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v5
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, v5
+; GFX90A-NEXT: v_mov_b32_e32 v9, v0
+; GFX90A-NEXT: v_mov_b32_e32 v10, v3
+; GFX90A-NEXT: global_store_dwordx3 v6, v[8:10], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -5692,15 +5709,14 @@ define void @v_shuffle_v3p3_v4p3__7_0_5(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v5
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v9, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v5
+; GFX942-NEXT: v_mov_b32_e32 v10, v3
+; GFX942-NEXT: global_store_dwordx3 v6, v[8:10], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -5899,28 +5915,29 @@ define void @v_shuffle_v3p3_v4p3__7_4_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_4_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v1
-; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v1
+; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v4p3__7_4_5:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-NEXT: v_mov_b32_e32 v4, v1
-; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v1
+; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -5947,29 +5964,29 @@ define void @v_shuffle_v3p3_v4p3__7_6_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_6_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v3
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_mov_b32_e32 v6, v1
-; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: v_mov_b32_e32 v8, v1
+; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v4p3__7_6_5:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v3
-; GFX942-NEXT: v_mov_b32_e32 v5, v2
-; GFX942-NEXT: v_mov_b32_e32 v6, v1
-; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v2
+; GFX942-NEXT: v_mov_b32_e32 v8, v1
+; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -6706,29 +6723,29 @@ define void @v_shuffle_v3p3_v4p3__7_4_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_4_6:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v3
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v2
+; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v4p3__7_4_6:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v3
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v2
+; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -7484,29 +7501,29 @@ define void @v_shuffle_v3p3_v4p3__7_4_7(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_4_7:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v3
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v3
+; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v4p3__7_4_7:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v3
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v3
+; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v2f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v2f32.ll
index df148f299a165..c7092f04a23ed 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v2f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v2f32.ll
@@ -272,27 +272,27 @@ define void @v_shuffle_v4f32_v2f32__3_2_u_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v2f32__3_2_u_u:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v1
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: global_store_dwordx4 v2, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v2f32__3_2_u_u:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:3]
+; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v3
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v1
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: global_store_dwordx4 v2, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=v"()
@@ -2380,28 +2380,29 @@ define void @v_shuffle_v4f32_v2f32__3_3_u_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v2f32__3_3_u_2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
-; GFX90A-NEXT: v_mov_b32_e32 v1, v3
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
-; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v1
+; GFX90A-NEXT: v_mov_b32_e32 v5, v1
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: global_store_dwordx4 v2, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v2f32__3_3_u_2:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:3]
+; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v3
-; GFX942-NEXT: v_mov_b32_e32 v1, v3
-; GFX942-NEXT: v_mov_b32_e32 v3, v2
-; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v4, v1
+; GFX942-NEXT: v_mov_b32_e32 v5, v1
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: global_store_dwordx4 v2, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v3f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v3f32.ll
index d4ee6fa20cad8..1224ab2b381c9 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v3f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v3f32.ll
@@ -255,15 +255,15 @@ define void @v_shuffle_v4f32_v3f32__5_0_u_u(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -271,15 +271,16 @@ define void @v_shuffle_v4f32_v3f32__5_0_u_u(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -413,27 +414,27 @@ define void @v_shuffle_v4f32_v3f32__5_3_u_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_3_u_u:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_3_u_u:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v4
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -553,15 +554,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_u(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -569,16 +571,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_u(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
-; GFX942-NEXT: v_mov_b32_e32 v1, v6
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -609,16 +612,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_u(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: v_mov_b32_e32 v4, v1
+; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -626,17 +629,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_u(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v2
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: v_mov_b32_e32 v4, v1
+; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -719,27 +722,29 @@ define void @v_shuffle_v4f32_v3f32__5_5_3_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_3_u:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
-; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_3_u:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v4
-; GFX942-NEXT: v_mov_b32_e32 v1, v4
-; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -766,28 +771,29 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_4_u:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v6, v1
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_4_u:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v4
-; GFX942-NEXT: v_mov_b32_e32 v1, v4
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: v_mov_b32_e32 v6, v1
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -1031,31 +1037,31 @@ define void @v_shuffle_v4f32_v3f32__5_5_5_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_5_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
-; GFX90A-NEXT: v_mov_b32_e32 v2, v6
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_5_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
-; GFX942-NEXT: v_mov_b32_e32 v1, v6
-; GFX942-NEXT: v_mov_b32_e32 v2, v6
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -1083,28 +1089,31 @@ define void @v_shuffle_v4f32_v3f32__5_5_5_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_5_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_5_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v4
-; GFX942-NEXT: v_mov_b32_e32 v1, v4
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
-; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v1
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -1180,28 +1189,29 @@ define void @v_shuffle_v4f32_v3f32__u_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v3f32__u_0_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_mov_b32_e32 v1, v0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v0
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v3f32__u_0_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-NEXT: v_mov_b32_e32 v1, v0
-; GFX942-NEXT: v_mov_b32_e32 v2, v0
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -1325,31 +1335,31 @@ define void @v_shuffle_v4f32_v3f32__2_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v3f32__2_0_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v3f32__2_0_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
-; GFX942-NEXT: v_mov_b32_e32 v1, v4
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -1375,28 +1385,29 @@ define void @v_shuffle_v4f32_v3f32__3_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v3f32__3_0_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_mov_b32_e32 v1, v0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v0
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v3f32__3_0_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-NEXT: v_mov_b32_e32 v1, v0
-; GFX942-NEXT: v_mov_b32_e32 v2, v0
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -1488,15 +1499,15 @@ define void @v_shuffle_v4f32_v3f32__5_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v0
+; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1506,15 +1517,15 @@ define void @v_shuffle_v4f32_v3f32__5_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v0, v2
-; GFX942-NEXT: v_mov_b32_e32 v1, v4
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v0
+; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -1545,34 +1556,33 @@ define void @v_shuffle_v4f32_v3f32__5_u_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_u_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v5, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_u_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v0, v2
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v5, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -1606,15 +1616,15 @@ define void @v_shuffle_v4f32_v3f32__5_1_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
-; GFX90A-NEXT: v_mov_b32_e32 v1, v5
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v0
+; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1624,15 +1634,15 @@ define void @v_shuffle_v4f32_v3f32__5_1_0_0(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v0, v2
-; GFX942-NEXT: v_mov_b32_e32 v1, v5
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v7, v1
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v0
+; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -1664,17 +1674,17 @@ define void @v_shuffle_v4f32_v3f32__5_2_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v6
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1682,17 +1692,17 @@ define void @v_shuffle_v4f32_v3f32__5_2_0_0(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v0, v2
-; GFX942-NEXT: v_mov_b32_e32 v1, v6
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v6
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -1724,17 +1734,17 @@ define void @v_shuffle_v4f32_v3f32__5_3_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v9, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[6:8]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v8
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v9, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v0
+; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1742,17 +1752,17 @@ define void @v_shuffle_v4f32_v3f32__5_3_0_0(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v9, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[6:8]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
-; GFX942-NEXT: v_mov_b32_e32 v0, v8
-; GFX942-NEXT: v_mov_b32_e32 v1, v6
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
-; GFX942-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v7, v2
+; GFX942-NEXT: v_mov_b32_e32 v9, v0
+; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -1843,15 +1853,15 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v4
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v0
+; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1861,15 +1871,15 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_0(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v0, v2
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v7, v4
+; GFX942-NEXT: v_mov_b32_e32 v9, v0
+; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -1901,16 +1911,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_u_0(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v4
+; GFX90A-NEXT: v_mov_b32_e32 v9, v0
+; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1918,16 +1928,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_u_0(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v3, v2
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
-; GFX942-NEXT: v_mov_b32_e32 v1, v6
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v9, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v7, v4
+; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -2018,17 +2028,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_2_0(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: v_mov_b32_e32 v2, v6
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v6
+; GFX90A-NEXT: v_mov_b32_e32 v5, v6
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2036,17 +2046,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_2_0(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v0, v2
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: v_mov_b32_e32 v2, v6
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v6
+; GFX942-NEXT: v_mov_b32_e32 v5, v6
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -2078,17 +2088,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_3_0(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v9, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[6:8]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v8
-; GFX90A-NEXT: v_mov_b32_e32 v1, v8
-; GFX90A-NEXT: v_mov_b32_e32 v2, v6
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v9, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v4
+; GFX90A-NEXT: v_mov_b32_e32 v8, v2
+; GFX90A-NEXT: v_mov_b32_e32 v9, v0
+; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2096,17 +2106,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_3_0(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v9, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[6:8]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
-; GFX942-NEXT: v_mov_b32_e32 v0, v8
-; GFX942-NEXT: v_mov_b32_e32 v1, v8
-; GFX942-NEXT: v_mov_b32_e32 v2, v6
-; GFX942-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v9, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v7, v4
+; GFX942-NEXT: v_mov_b32_e32 v8, v2
+; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -2603,16 +2613,17 @@ define void @v_shuffle_v4f32_v3f32__5_0_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v1
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
+; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2620,16 +2631,17 @@ define void @v_shuffle_v4f32_v3f32__5_0_1_1(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v8, v1
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -2661,16 +2673,17 @@ define void @v_shuffle_v4f32_v3f32__5_2_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v6
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v6, v1
+; GFX90A-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2678,17 +2691,17 @@ define void @v_shuffle_v4f32_v3f32__5_2_1_1(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v2
-; GFX942-NEXT: v_mov_b32_e32 v1, v4
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v6
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: v_mov_b32_e32 v6, v1
+; GFX942-NEXT: v_mov_b32_e32 v7, v1
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -2720,16 +2733,17 @@ define void @v_shuffle_v4f32_v3f32__5_3_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: v_mov_b32_e32 v8, v1
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
+; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2737,16 +2751,17 @@ define void @v_shuffle_v4f32_v3f32__5_3_1_1(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
-; GFX942-NEXT: v_mov_b32_e32 v1, v4
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v8, v1
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v7, v2
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -2834,16 +2849,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v4
+; GFX90A-NEXT: v_mov_b32_e32 v8, v1
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
+; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2851,17 +2867,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_1(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v2
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v8, v1
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v7, v4
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -2893,15 +2909,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_u_1(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v4
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
+; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2909,16 +2926,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_u_1(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v2
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v7, v4
+; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -2949,15 +2966,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_1(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v4
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
+; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2965,16 +2984,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_1(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
-; GFX942-NEXT: v_mov_b32_e32 v1, v6
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v7, v4
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -3006,16 +3026,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_2_1(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v6
+; GFX90A-NEXT: v_mov_b32_e32 v5, v6
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -3023,17 +3044,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_2_1(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v2
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
-; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v6
+; GFX942-NEXT: v_mov_b32_e32 v5, v6
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v1
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -3065,16 +3086,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_3_1(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v4
+; GFX90A-NEXT: v_mov_b32_e32 v8, v2
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
+; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -3082,17 +3104,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_3_1(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
-; GFX942-NEXT: v_mov_b32_e32 v1, v6
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v7, v4
+; GFX942-NEXT: v_mov_b32_e32 v8, v2
+; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -3124,16 +3146,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_1(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
-; GFX90A-NEXT: v_mov_b32_e32 v2, v5
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v4
+; GFX90A-NEXT: v_mov_b32_e32 v8, v3
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
+; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -3141,17 +3164,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_1(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
-; GFX942-NEXT: v_mov_b32_e32 v1, v6
-; GFX942-NEXT: v_mov_b32_e32 v2, v5
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v7, v4
+; GFX942-NEXT: v_mov_b32_e32 v8, v3
+; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -3584,17 +3607,17 @@ define void @v_shuffle_v4f32_v3f32__5_0_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[6:8]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v8
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v6
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -3602,17 +3625,17 @@ define void @v_shuffle_v4f32_v3f32__5_0_2_2(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[6:8]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: v_mov_b32_e32 v0, v8
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
-; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v6
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v2
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -3871,16 +3894,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_u_2(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v6
+; GFX90A-NEXT: v_mov_b32_e32 v5, v6
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -3888,16 +3911,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_u_2(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v3, v2
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
-; GFX942-NEXT: v_mov_b32_e32 v1, v6
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v6
+; GFX942-NEXT: v_mov_b32_e32 v5, v6
+; GFX942-NEXT: v_mov_b32_e32 v7, v2
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -3929,16 +3952,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_2(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[6:8]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v8
-; GFX90A-NEXT: v_mov_b32_e32 v1, v8
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v6
+; GFX90A-NEXT: v_mov_b32_e32 v5, v6
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -3946,16 +3970,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_2(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[6:8]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
-; GFX942-NEXT: v_mov_b32_e32 v0, v8
-; GFX942-NEXT: v_mov_b32_e32 v1, v8
-; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v6
+; GFX942-NEXT: v_mov_b32_e32 v5, v6
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v7, v2
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -4045,36 +4070,37 @@ define void @v_shuffle_v4f32_v3f32__5_5_3_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_3_2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[6:8]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v8
-; GFX90A-NEXT: v_mov_b32_e32 v1, v8
-; GFX90A-NEXT: v_mov_b32_e32 v2, v6
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, v6
+; GFX90A-NEXT: v_mov_b32_e32 v9, v6
+; GFX90A-NEXT: v_mov_b32_e32 v10, v4
+; GFX90A-NEXT: v_mov_b32_e32 v11, v2
+; GFX90A-NEXT: global_store_dwordx4 v3, v[8:11], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_3_2:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[6:8]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
-; GFX942-NEXT: v_mov_b32_e32 v0, v8
-; GFX942-NEXT: v_mov_b32_e32 v1, v8
-; GFX942-NEXT: v_mov_b32_e32 v2, v6
-; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v8, v6
+; GFX942-NEXT: v_mov_b32_e32 v9, v6
+; GFX942-NEXT: v_mov_b32_e32 v10, v4
+; GFX942-NEXT: v_mov_b32_e32 v11, v2
+; GFX942-NEXT: global_store_dwordx4 v3, v[8:11], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -4358,31 +4384,31 @@ define void @v_shuffle_v4f32_v3f32__5_3_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_3_3_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_3_3_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
-; GFX942-NEXT: v_mov_b32_e32 v1, v4
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -4410,29 +4436,27 @@ define void @v_shuffle_v4f32_v3f32__5_u_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_u_3_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v1, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v1, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: global_store_dwordx4 v3, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_u_3_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
-; GFX942-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: global_store_dwordx4 v3, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -4463,17 +4487,17 @@ define void @v_shuffle_v4f32_v3f32__5_0_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v2
+; GFX90A-NEXT: v_mov_b32_e32 v9, v2
+; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -4481,17 +4505,17 @@ define void @v_shuffle_v4f32_v3f32__5_0_3_3(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v8, v2
+; GFX942-NEXT: v_mov_b32_e32 v9, v2
+; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -4580,36 +4604,35 @@ define void @v_shuffle_v4f32_v3f32__5_2_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_2_3_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: v_mov_b32_e32 v8, v4
+; GFX90A-NEXT: v_mov_b32_e32 v9, v4
+; GFX90A-NEXT: global_store_dwordx4 v3, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_2_3_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v7, v2
+; GFX942-NEXT: v_mov_b32_e32 v8, v4
+; GFX942-NEXT: v_mov_b32_e32 v9, v4
+; GFX942-NEXT: global_store_dwordx4 v3, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -4637,31 +4660,31 @@ define void @v_shuffle_v4f32_v3f32__5_4_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_4_3_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v5
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v1
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_4_3_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
-; GFX942-NEXT: v_mov_b32_e32 v1, v5
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v1
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -4689,31 +4712,31 @@ define void @v_shuffle_v4f32_v3f32__5_5_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_3_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_3_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
-; GFX942-NEXT: v_mov_b32_e32 v1, v6
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -4741,28 +4764,29 @@ define void @v_shuffle_v4f32_v3f32__5_5_u_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_u_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
-; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_u_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v4
-; GFX942-NEXT: v_mov_b32_e32 v1, v4
-; GFX942-NEXT: v_mov_b32_e32 v3, v2
-; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -4793,16 +4817,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_3(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v4
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v2
+; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -4810,17 +4835,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_3(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
-; GFX942-NEXT: v_mov_b32_e32 v1, v6
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v7, v4
+; GFX942-NEXT: v_mov_b32_e32 v9, v2
+; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -5376,28 +5401,27 @@ define void @v_shuffle_v4f32_v3f32__5_u_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_u_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
-; GFX90A-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v1
+; GFX90A-NEXT: v_mov_b32_e32 v5, v1
+; GFX90A-NEXT: global_store_dwordx4 v3, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_u_4_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v2
-; GFX942-NEXT: v_mov_b32_e32 v2, v1
-; GFX942-NEXT: v_mov_b32_e32 v3, v1
-; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v4, v1
+; GFX942-NEXT: v_mov_b32_e32 v5, v1
+; GFX942-NEXT: global_store_dwordx4 v3, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -5428,17 +5452,17 @@ define void @v_shuffle_v4f32_v3f32__5_0_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: v_mov_b32_e32 v2, v5
-; GFX90A-NEXT: v_mov_b32_e32 v3, v5
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v3
+; GFX90A-NEXT: v_mov_b32_e32 v9, v3
+; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -5446,17 +5470,17 @@ define void @v_shuffle_v4f32_v3f32__5_0_4_4(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
-; GFX942-NEXT: v_mov_b32_e32 v2, v5
-; GFX942-NEXT: v_mov_b32_e32 v3, v5
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v8, v3
+; GFX942-NEXT: v_mov_b32_e32 v9, v3
+; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -5543,36 +5567,35 @@ define void @v_shuffle_v4f32_v3f32__5_2_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_2_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: v_mov_b32_e32 v2, v5
-; GFX90A-NEXT: v_mov_b32_e32 v3, v5
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: v_mov_b32_e32 v8, v5
+; GFX90A-NEXT: v_mov_b32_e32 v9, v5
+; GFX90A-NEXT: global_store_dwordx4 v3, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_2_4_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
-; GFX942-NEXT: v_mov_b32_e32 v2, v5
-; GFX942-NEXT: v_mov_b32_e32 v3, v5
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v7, v2
+; GFX942-NEXT: v_mov_b32_e32 v8, v5
+; GFX942-NEXT: v_mov_b32_e32 v9, v5
+; GFX942-NEXT: global_store_dwordx4 v3, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -5599,28 +5622,31 @@ define void @v_shuffle_v4f32_v3f32__5_3_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_3_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v1
+; GFX90A-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_3_4_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v4
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v1
+; GFX942-NEXT: v_mov_b32_e32 v7, v1
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -5648,28 +5674,31 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v6, v1
+; GFX90A-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_4_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v4
-; GFX942-NEXT: v_mov_b32_e32 v1, v4
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: v_mov_b32_e32 v6, v1
+; GFX942-NEXT: v_mov_b32_e32 v7, v1
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -5697,27 +5726,29 @@ define void @v_shuffle_v4f32_v3f32__5_5_u_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_u_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
-; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_u_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v4
-; GFX942-NEXT: v_mov_b32_e32 v1, v4
-; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v1
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -5749,16 +5780,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_4(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
-; GFX90A-NEXT: v_mov_b32_e32 v3, v5
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v4
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v3
+; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -5766,17 +5798,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_4(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
-; GFX942-NEXT: v_mov_b32_e32 v1, v6
-; GFX942-NEXT: v_mov_b32_e32 v3, v5
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v7, v4
+; GFX942-NEXT: v_mov_b32_e32 v9, v3
+; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -5808,17 +5840,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_4(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: v_mov_b32_e32 v3, v5
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v4
+; GFX90A-NEXT: v_mov_b32_e32 v8, v1
+; GFX90A-NEXT: v_mov_b32_e32 v9, v3
+; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -5826,17 +5858,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_4(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
-; GFX942-NEXT: v_mov_b32_e32 v1, v6
-; GFX942-NEXT: v_mov_b32_e32 v3, v5
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v8, v1
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v7, v4
+; GFX942-NEXT: v_mov_b32_e32 v9, v3
+; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -5922,27 +5954,31 @@ define void @v_shuffle_v4f32_v3f32__5_5_3_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_3_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
-; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_3_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v4
-; GFX942-NEXT: v_mov_b32_e32 v1, v4
-; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v7, v1
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -6508,30 +6544,31 @@ define void @v_shuffle_v4f32_v3f32__5_3_5_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_3_5_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_3_5_5:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v4
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
-; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v2
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -6604,28 +6641,29 @@ define void @v_shuffle_v4f32_v3f32__5_5_u_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_u_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
-; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_u_5:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v2
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: v_mov_b32_e32 v3, v2
-; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v2
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -6657,16 +6695,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_5(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
-; GFX90A-NEXT: v_mov_b32_e32 v3, v6
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v4
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v4
+; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -6674,17 +6713,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_5(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
-; GFX942-NEXT: v_mov_b32_e32 v1, v6
-; GFX942-NEXT: v_mov_b32_e32 v3, v6
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v7, v4
+; GFX942-NEXT: v_mov_b32_e32 v9, v4
+; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -6830,28 +6869,31 @@ define void @v_shuffle_v4f32_v3f32__5_5_3_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_3_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_3_5:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v4
-; GFX942-NEXT: v_mov_b32_e32 v1, v4
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
-; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v7, v2
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v4f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v4f32.ll
index edc540edb3ad1..d5bd41397c4f0 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v4f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v4f32.ll
@@ -963,26 +963,29 @@ define void @v_shuffle_v4f32_v4f32__7_7_4_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_4_u:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_4_u:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v3
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -1009,26 +1012,29 @@ define void @v_shuffle_v4f32_v4f32__7_7_5_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_5_u:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: v_mov_b32_e32 v4, v1
-; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_mov_b32_e32 v8, v1
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_5_u:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: v_mov_b32_e32 v4, v1
-; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v3
+; GFX942-NEXT: v_mov_b32_e32 v8, v1
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -1440,31 +1446,31 @@ define void @v_shuffle_v4f32_v4f32__7_7_7_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_7_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v3
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v1
-; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_mov_b32_e32 v8, v3
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_7_5:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v3
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v7, v3
+; GFX942-NEXT: v_mov_b32_e32 v8, v3
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -1734,28 +1740,31 @@ define void @v_shuffle_v4f32_v4f32__2_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v4f32__2_0_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v0
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v4f32__2_0_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v0
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -2785,14 +2794,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_4_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v5
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, v5
+; GFX90A-NEXT: v_mov_b32_e32 v9, v5
+; GFX90A-NEXT: v_mov_b32_e32 v10, v2
+; GFX90A-NEXT: v_mov_b32_e32 v11, v0
+; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2802,14 +2812,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_4_0(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: v_mov_b32_e32 v4, v5
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v11, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v5
+; GFX942-NEXT: v_mov_b32_e32 v9, v5
+; GFX942-NEXT: v_mov_b32_e32 v10, v2
+; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -4138,14 +4149,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_4_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v5
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v1
-; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, v5
+; GFX90A-NEXT: v_mov_b32_e32 v9, v5
+; GFX90A-NEXT: v_mov_b32_e32 v10, v2
+; GFX90A-NEXT: v_mov_b32_e32 v11, v1
+; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -4155,14 +4167,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_4_1(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-NEXT: v_mov_b32_e32 v4, v5
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v11, v1
+; GFX942-NEXT: v_mov_b32_e32 v8, v5
+; GFX942-NEXT: v_mov_b32_e32 v9, v5
+; GFX942-NEXT: v_mov_b32_e32 v10, v2
+; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -4196,14 +4209,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_5_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v5
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v1
-; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, v5
+; GFX90A-NEXT: v_mov_b32_e32 v9, v5
+; GFX90A-NEXT: v_mov_b32_e32 v10, v3
+; GFX90A-NEXT: v_mov_b32_e32 v11, v1
+; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -4213,14 +4227,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_5_1(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-NEXT: v_mov_b32_e32 v4, v5
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v11, v1
+; GFX942-NEXT: v_mov_b32_e32 v8, v5
+; GFX942-NEXT: v_mov_b32_e32 v9, v5
+; GFX942-NEXT: v_mov_b32_e32 v10, v3
+; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -5448,34 +5463,37 @@ define void @v_shuffle_v4f32_v4f32__7_7_4_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_4_2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:7]
+; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v10, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:3]
+; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v7
-; GFX90A-NEXT: v_mov_b32_e32 v8, v4
-; GFX90A-NEXT: v_mov_b32_e32 v9, v2
-; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v10, v7
+; GFX90A-NEXT: v_mov_b32_e32 v11, v7
+; GFX90A-NEXT: v_mov_b32_e32 v12, v4
+; GFX90A-NEXT: v_mov_b32_e32 v13, v2
+; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_4_2:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:7]
+; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v10, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:3]
+; GFX942-NEXT: ; def v[4:7]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, v7
-; GFX942-NEXT: v_mov_b32_e32 v8, v4
-; GFX942-NEXT: v_mov_b32_e32 v9, v2
-; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v10, v7
+; GFX942-NEXT: v_mov_b32_e32 v11, v7
+; GFX942-NEXT: v_mov_b32_e32 v12, v4
+; GFX942-NEXT: v_mov_b32_e32 v13, v2
+; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -7172,28 +7190,31 @@ define void @v_shuffle_v4f32_v4f32__6_4_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v4f32__6_4_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v0
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v4f32__6_4_4_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v0
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -7271,28 +7292,29 @@ define void @v_shuffle_v4f32_v4f32__7_u_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_u_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v0
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_u_4_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v0
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -7559,30 +7581,31 @@ define void @v_shuffle_v4f32_v4f32__7_5_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_5_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v0
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_5_4_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: v_mov_b32_e32 v3, v1
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v1
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v0
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -7660,28 +7683,31 @@ define void @v_shuffle_v4f32_v4f32__7_7_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v0
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_4_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v3
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v0
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -7763,14 +7789,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_0_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v5
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: v_mov_b32_e32 v7, v2
-; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, v5
+; GFX90A-NEXT: v_mov_b32_e32 v9, v5
+; GFX90A-NEXT: v_mov_b32_e32 v10, v0
+; GFX90A-NEXT: v_mov_b32_e32 v11, v2
+; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -7780,14 +7807,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_0_4(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: v_mov_b32_e32 v4, v5
-; GFX942-NEXT: v_mov_b32_e32 v7, v2
-; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v10, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v5
+; GFX942-NEXT: v_mov_b32_e32 v9, v5
+; GFX942-NEXT: v_mov_b32_e32 v11, v2
+; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -8041,31 +8069,31 @@ define void @v_shuffle_v4f32_v4f32__7_7_6_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_6_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v3
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_mov_b32_e32 v8, v2
+; GFX90A-NEXT: v_mov_b32_e32 v9, v0
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_6_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v3
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v3
+; GFX942-NEXT: v_mov_b32_e32 v8, v2
+; GFX942-NEXT: v_mov_b32_e32 v9, v0
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -8946,28 +8974,31 @@ define void @v_shuffle_v4f32_v4f32__7_7_5_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_5_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: v_mov_b32_e32 v4, v1
-; GFX90A-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_mov_b32_e32 v8, v1
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_5_5:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: v_mov_b32_e32 v4, v1
-; GFX942-NEXT: v_mov_b32_e32 v5, v1
-; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v3
+; GFX942-NEXT: v_mov_b32_e32 v8, v1
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -9047,14 +9078,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_0_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v5
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, v5
+; GFX90A-NEXT: v_mov_b32_e32 v9, v5
+; GFX90A-NEXT: v_mov_b32_e32 v10, v0
+; GFX90A-NEXT: v_mov_b32_e32 v11, v3
+; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -9064,14 +9096,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_0_5(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: v_mov_b32_e32 v4, v5
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v10, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v5
+; GFX942-NEXT: v_mov_b32_e32 v9, v5
+; GFX942-NEXT: v_mov_b32_e32 v11, v3
+; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -9105,14 +9138,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_1_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v5
-; GFX90A-NEXT: v_mov_b32_e32 v6, v1
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, v5
+; GFX90A-NEXT: v_mov_b32_e32 v9, v5
+; GFX90A-NEXT: v_mov_b32_e32 v10, v1
+; GFX90A-NEXT: v_mov_b32_e32 v11, v3
+; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -9122,14 +9156,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_1_5(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, v1
-; GFX942-NEXT: v_mov_b32_e32 v4, v5
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v10, v1
+; GFX942-NEXT: v_mov_b32_e32 v8, v5
+; GFX942-NEXT: v_mov_b32_e32 v9, v5
+; GFX942-NEXT: v_mov_b32_e32 v11, v3
+; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -9277,28 +9312,31 @@ define void @v_shuffle_v4f32_v4f32__7_7_4_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_4_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_4_5:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: v_mov_b32_e32 v5, v1
-; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v3
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -9327,31 +9365,31 @@ define void @v_shuffle_v4f32_v4f32__7_7_6_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_6_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v3
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v1
-; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_mov_b32_e32 v8, v2
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_6_5:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v3
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v3
+; GFX942-NEXT: v_mov_b32_e32 v8, v2
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -10319,15 +10357,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_0_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v10, 0
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v5
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v4
-; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, v5
+; GFX90A-NEXT: v_mov_b32_e32 v9, v5
+; GFX90A-NEXT: v_mov_b32_e32 v10, v0
+; GFX90A-NEXT: v_mov_b32_e32 v11, v4
+; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -10337,15 +10375,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_0_6(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v10, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v5
-; GFX942-NEXT: v_mov_b32_e32 v7, v5
-; GFX942-NEXT: v_mov_b32_e32 v9, v4
-; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v10, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v5
+; GFX942-NEXT: v_mov_b32_e32 v9, v5
+; GFX942-NEXT: v_mov_b32_e32 v11, v4
+; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -10553,31 +10591,31 @@ define void @v_shuffle_v4f32_v4f32__7_7_4_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_4_6:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v3
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: v_mov_b32_e32 v7, v2
-; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v2
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_4_6:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v3
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: v_mov_b32_e32 v7, v2
-; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v3
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v2
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -11777,31 +11815,31 @@ define void @v_shuffle_v4f32_v4f32__7_7_4_7(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_4_7:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v3
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v3
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_4_7:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v3
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v3
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -11830,31 +11868,31 @@ define void @v_shuffle_v4f32_v4f32__7_7_5_7(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_5_7:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v3
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_mov_b32_e32 v6, v1
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, v1
+; GFX90A-NEXT: v_mov_b32_e32 v9, v3
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_5_7:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v3
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
-; GFX942-NEXT: v_mov_b32_e32 v6, v1
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v8, v1
+; GFX942-NEXT: v_mov_b32_e32 v9, v3
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v2i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v2i32.ll
index 9d3affa6da266..03503c9dac197 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v2i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v2i32.ll
@@ -272,27 +272,27 @@ define void @v_shuffle_v4i32_v2i32__3_2_u_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v2i32__3_2_u_u:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v1
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: global_store_dwordx4 v2, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v2i32__3_2_u_u:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:3]
+; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v3
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v1
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: global_store_dwordx4 v2, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i32> asm "; def $0", "=v"()
@@ -2386,28 +2386,29 @@ define void @v_shuffle_v4i32_v2i32__3_3_u_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v2i32__3_3_u_2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
-; GFX90A-NEXT: v_mov_b32_e32 v1, v3
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
-; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v1
+; GFX90A-NEXT: v_mov_b32_e32 v5, v1
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: global_store_dwordx4 v2, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v2i32__3_3_u_2:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:3]
+; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v3
-; GFX942-NEXT: v_mov_b32_e32 v1, v3
-; GFX942-NEXT: v_mov_b32_e32 v3, v2
-; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v4, v1
+; GFX942-NEXT: v_mov_b32_e32 v5, v1
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: global_store_dwordx4 v2, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i32> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v3i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v3i32.ll
index 1a669adf2b635..0222f73fbd193 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v3i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v3i32.ll
@@ -255,15 +255,15 @@ define void @v_shuffle_v4i32_v3i32__5_0_u_u(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -271,15 +271,16 @@ define void @v_shuffle_v4i32_v3i32__5_0_u_u(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -413,27 +414,27 @@ define void @v_shuffle_v4i32_v3i32__5_3_u_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_3_u_u:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_3_u_u:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v4
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -553,15 +554,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_u(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -569,16 +571,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_u(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
-; GFX942-NEXT: v_mov_b32_e32 v1, v6
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -609,16 +612,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_u(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: v_mov_b32_e32 v4, v1
+; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -626,17 +629,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_u(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v2
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: v_mov_b32_e32 v4, v1
+; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -719,27 +722,29 @@ define void @v_shuffle_v4i32_v3i32__5_5_3_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_3_u:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
-; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_3_u:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v4
-; GFX942-NEXT: v_mov_b32_e32 v1, v4
-; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -766,28 +771,29 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_4_u:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v6, v1
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_4_u:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v4
-; GFX942-NEXT: v_mov_b32_e32 v1, v4
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: v_mov_b32_e32 v6, v1
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -1031,31 +1037,31 @@ define void @v_shuffle_v4i32_v3i32__5_5_5_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_5_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
-; GFX90A-NEXT: v_mov_b32_e32 v2, v6
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_5_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
-; GFX942-NEXT: v_mov_b32_e32 v1, v6
-; GFX942-NEXT: v_mov_b32_e32 v2, v6
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -1083,28 +1089,31 @@ define void @v_shuffle_v4i32_v3i32__5_5_5_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_5_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_5_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v4
-; GFX942-NEXT: v_mov_b32_e32 v1, v4
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
-; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v1
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -1180,28 +1189,29 @@ define void @v_shuffle_v4i32_v3i32__u_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v3i32__u_0_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_mov_b32_e32 v1, v0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v0
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v3i32__u_0_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-NEXT: v_mov_b32_e32 v1, v0
-; GFX942-NEXT: v_mov_b32_e32 v2, v0
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -1325,31 +1335,31 @@ define void @v_shuffle_v4i32_v3i32__2_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v3i32__2_0_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v3i32__2_0_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
-; GFX942-NEXT: v_mov_b32_e32 v1, v4
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -1375,28 +1385,29 @@ define void @v_shuffle_v4i32_v3i32__3_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v3i32__3_0_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_mov_b32_e32 v1, v0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v0
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v3i32__3_0_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-NEXT: v_mov_b32_e32 v1, v0
-; GFX942-NEXT: v_mov_b32_e32 v2, v0
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -1488,15 +1499,15 @@ define void @v_shuffle_v4i32_v3i32__5_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v0
+; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1506,15 +1517,15 @@ define void @v_shuffle_v4i32_v3i32__5_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v0, v2
-; GFX942-NEXT: v_mov_b32_e32 v1, v4
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v0
+; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -1545,34 +1556,33 @@ define void @v_shuffle_v4i32_v3i32__5_u_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_u_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v5, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_u_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v0, v2
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v5, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -1606,15 +1616,15 @@ define void @v_shuffle_v4i32_v3i32__5_1_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
-; GFX90A-NEXT: v_mov_b32_e32 v1, v5
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v0
+; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1624,15 +1634,15 @@ define void @v_shuffle_v4i32_v3i32__5_1_0_0(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v0, v2
-; GFX942-NEXT: v_mov_b32_e32 v1, v5
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v7, v1
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v0
+; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -1664,17 +1674,17 @@ define void @v_shuffle_v4i32_v3i32__5_2_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v6
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1682,17 +1692,17 @@ define void @v_shuffle_v4i32_v3i32__5_2_0_0(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v0, v2
-; GFX942-NEXT: v_mov_b32_e32 v1, v6
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v6
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -1724,17 +1734,17 @@ define void @v_shuffle_v4i32_v3i32__5_3_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v9, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[6:8]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v8
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v9, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v0
+; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1742,17 +1752,17 @@ define void @v_shuffle_v4i32_v3i32__5_3_0_0(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v9, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[6:8]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
-; GFX942-NEXT: v_mov_b32_e32 v0, v8
-; GFX942-NEXT: v_mov_b32_e32 v1, v6
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
-; GFX942-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v7, v2
+; GFX942-NEXT: v_mov_b32_e32 v9, v0
+; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -1843,15 +1853,15 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v4
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v0
+; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1861,15 +1871,15 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_0(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v0, v2
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v7, v4
+; GFX942-NEXT: v_mov_b32_e32 v9, v0
+; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -1901,16 +1911,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_u_0(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v4
+; GFX90A-NEXT: v_mov_b32_e32 v9, v0
+; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1918,16 +1928,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_u_0(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v3, v2
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
-; GFX942-NEXT: v_mov_b32_e32 v1, v6
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v9, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v7, v4
+; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -2018,17 +2028,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_2_0(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: v_mov_b32_e32 v2, v6
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v6
+; GFX90A-NEXT: v_mov_b32_e32 v5, v6
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2036,17 +2046,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_2_0(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v0, v2
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: v_mov_b32_e32 v2, v6
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v6
+; GFX942-NEXT: v_mov_b32_e32 v5, v6
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -2078,17 +2088,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_3_0(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v9, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[6:8]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v8
-; GFX90A-NEXT: v_mov_b32_e32 v1, v8
-; GFX90A-NEXT: v_mov_b32_e32 v2, v6
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v9, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v4
+; GFX90A-NEXT: v_mov_b32_e32 v8, v2
+; GFX90A-NEXT: v_mov_b32_e32 v9, v0
+; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2096,17 +2106,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_3_0(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v9, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[6:8]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
-; GFX942-NEXT: v_mov_b32_e32 v0, v8
-; GFX942-NEXT: v_mov_b32_e32 v1, v8
-; GFX942-NEXT: v_mov_b32_e32 v2, v6
-; GFX942-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v9, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v7, v4
+; GFX942-NEXT: v_mov_b32_e32 v8, v2
+; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -2603,16 +2613,17 @@ define void @v_shuffle_v4i32_v3i32__5_0_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v1
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
+; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2620,16 +2631,17 @@ define void @v_shuffle_v4i32_v3i32__5_0_1_1(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v8, v1
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -2661,16 +2673,17 @@ define void @v_shuffle_v4i32_v3i32__5_2_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v6
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v6, v1
+; GFX90A-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2678,17 +2691,17 @@ define void @v_shuffle_v4i32_v3i32__5_2_1_1(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v2
-; GFX942-NEXT: v_mov_b32_e32 v1, v4
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v6
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: v_mov_b32_e32 v6, v1
+; GFX942-NEXT: v_mov_b32_e32 v7, v1
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -2720,16 +2733,17 @@ define void @v_shuffle_v4i32_v3i32__5_3_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: v_mov_b32_e32 v8, v1
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
+; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2737,16 +2751,17 @@ define void @v_shuffle_v4i32_v3i32__5_3_1_1(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
-; GFX942-NEXT: v_mov_b32_e32 v1, v4
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v8, v1
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v7, v2
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -2834,16 +2849,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v4
+; GFX90A-NEXT: v_mov_b32_e32 v8, v1
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
+; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2851,17 +2867,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_1(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v2
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v8, v1
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v7, v4
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -2893,15 +2909,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_u_1(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v4
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
+; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2909,16 +2926,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_u_1(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v2
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v7, v4
+; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -2949,15 +2966,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_1(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v4
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
+; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2965,16 +2984,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_1(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
-; GFX942-NEXT: v_mov_b32_e32 v1, v6
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v7, v4
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -3006,16 +3026,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_2_1(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v6
+; GFX90A-NEXT: v_mov_b32_e32 v5, v6
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -3023,17 +3044,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_2_1(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v2
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
-; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v6
+; GFX942-NEXT: v_mov_b32_e32 v5, v6
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v1
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -3065,16 +3086,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_3_1(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v4
+; GFX90A-NEXT: v_mov_b32_e32 v8, v2
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
+; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -3082,17 +3104,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_3_1(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
-; GFX942-NEXT: v_mov_b32_e32 v1, v6
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v7, v4
+; GFX942-NEXT: v_mov_b32_e32 v8, v2
+; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -3124,16 +3146,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_1(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
-; GFX90A-NEXT: v_mov_b32_e32 v2, v5
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v4
+; GFX90A-NEXT: v_mov_b32_e32 v8, v3
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
+; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -3141,17 +3164,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_1(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
-; GFX942-NEXT: v_mov_b32_e32 v1, v6
-; GFX942-NEXT: v_mov_b32_e32 v2, v5
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v7, v4
+; GFX942-NEXT: v_mov_b32_e32 v8, v3
+; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -3584,17 +3607,17 @@ define void @v_shuffle_v4i32_v3i32__5_0_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[6:8]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v8
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v6
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -3602,17 +3625,17 @@ define void @v_shuffle_v4i32_v3i32__5_0_2_2(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[6:8]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: v_mov_b32_e32 v0, v8
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
-; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v6
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v2
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -3871,16 +3894,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_u_2(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v6
+; GFX90A-NEXT: v_mov_b32_e32 v5, v6
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -3888,16 +3911,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_u_2(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v3, v2
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
-; GFX942-NEXT: v_mov_b32_e32 v1, v6
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v6
+; GFX942-NEXT: v_mov_b32_e32 v5, v6
+; GFX942-NEXT: v_mov_b32_e32 v7, v2
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -3929,16 +3952,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_2(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[6:8]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v8
-; GFX90A-NEXT: v_mov_b32_e32 v1, v8
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v6
+; GFX90A-NEXT: v_mov_b32_e32 v5, v6
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -3946,16 +3970,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_2(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[6:8]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
-; GFX942-NEXT: v_mov_b32_e32 v0, v8
-; GFX942-NEXT: v_mov_b32_e32 v1, v8
-; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v6
+; GFX942-NEXT: v_mov_b32_e32 v5, v6
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v7, v2
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -4045,36 +4070,37 @@ define void @v_shuffle_v4i32_v3i32__5_5_3_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_3_2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[6:8]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v8
-; GFX90A-NEXT: v_mov_b32_e32 v1, v8
-; GFX90A-NEXT: v_mov_b32_e32 v2, v6
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, v6
+; GFX90A-NEXT: v_mov_b32_e32 v9, v6
+; GFX90A-NEXT: v_mov_b32_e32 v10, v4
+; GFX90A-NEXT: v_mov_b32_e32 v11, v2
+; GFX90A-NEXT: global_store_dwordx4 v3, v[8:11], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_3_2:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[6:8]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
-; GFX942-NEXT: v_mov_b32_e32 v0, v8
-; GFX942-NEXT: v_mov_b32_e32 v1, v8
-; GFX942-NEXT: v_mov_b32_e32 v2, v6
-; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v8, v6
+; GFX942-NEXT: v_mov_b32_e32 v9, v6
+; GFX942-NEXT: v_mov_b32_e32 v10, v4
+; GFX942-NEXT: v_mov_b32_e32 v11, v2
+; GFX942-NEXT: global_store_dwordx4 v3, v[8:11], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -4358,31 +4384,31 @@ define void @v_shuffle_v4i32_v3i32__5_3_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_3_3_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_3_3_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
-; GFX942-NEXT: v_mov_b32_e32 v1, v4
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -4410,29 +4436,27 @@ define void @v_shuffle_v4i32_v3i32__5_u_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_u_3_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v1, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v1, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: global_store_dwordx4 v3, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_u_3_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
-; GFX942-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: global_store_dwordx4 v3, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -4463,17 +4487,17 @@ define void @v_shuffle_v4i32_v3i32__5_0_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v2
+; GFX90A-NEXT: v_mov_b32_e32 v9, v2
+; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -4481,17 +4505,17 @@ define void @v_shuffle_v4i32_v3i32__5_0_3_3(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v8, v2
+; GFX942-NEXT: v_mov_b32_e32 v9, v2
+; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -4580,36 +4604,35 @@ define void @v_shuffle_v4i32_v3i32__5_2_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_2_3_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: v_mov_b32_e32 v8, v4
+; GFX90A-NEXT: v_mov_b32_e32 v9, v4
+; GFX90A-NEXT: global_store_dwordx4 v3, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_2_3_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v7, v2
+; GFX942-NEXT: v_mov_b32_e32 v8, v4
+; GFX942-NEXT: v_mov_b32_e32 v9, v4
+; GFX942-NEXT: global_store_dwordx4 v3, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -4637,31 +4660,31 @@ define void @v_shuffle_v4i32_v3i32__5_4_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_4_3_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v5
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v1
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_4_3_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
-; GFX942-NEXT: v_mov_b32_e32 v1, v5
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v1
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -4689,31 +4712,31 @@ define void @v_shuffle_v4i32_v3i32__5_5_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_3_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_3_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
-; GFX942-NEXT: v_mov_b32_e32 v1, v6
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -4741,28 +4764,29 @@ define void @v_shuffle_v4i32_v3i32__5_5_u_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_u_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
-; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_u_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v4
-; GFX942-NEXT: v_mov_b32_e32 v1, v4
-; GFX942-NEXT: v_mov_b32_e32 v3, v2
-; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -4793,16 +4817,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_3(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v4
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v2
+; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -4810,17 +4835,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_3(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
-; GFX942-NEXT: v_mov_b32_e32 v1, v6
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v7, v4
+; GFX942-NEXT: v_mov_b32_e32 v9, v2
+; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -5376,28 +5401,27 @@ define void @v_shuffle_v4i32_v3i32__5_u_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_u_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
-; GFX90A-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v1
+; GFX90A-NEXT: v_mov_b32_e32 v5, v1
+; GFX90A-NEXT: global_store_dwordx4 v3, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_u_4_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v2
-; GFX942-NEXT: v_mov_b32_e32 v2, v1
-; GFX942-NEXT: v_mov_b32_e32 v3, v1
-; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v4, v1
+; GFX942-NEXT: v_mov_b32_e32 v5, v1
+; GFX942-NEXT: global_store_dwordx4 v3, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -5428,17 +5452,17 @@ define void @v_shuffle_v4i32_v3i32__5_0_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: v_mov_b32_e32 v2, v5
-; GFX90A-NEXT: v_mov_b32_e32 v3, v5
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v3
+; GFX90A-NEXT: v_mov_b32_e32 v9, v3
+; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -5446,17 +5470,17 @@ define void @v_shuffle_v4i32_v3i32__5_0_4_4(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
-; GFX942-NEXT: v_mov_b32_e32 v2, v5
-; GFX942-NEXT: v_mov_b32_e32 v3, v5
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v8, v3
+; GFX942-NEXT: v_mov_b32_e32 v9, v3
+; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -5543,36 +5567,35 @@ define void @v_shuffle_v4i32_v3i32__5_2_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_2_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: v_mov_b32_e32 v2, v5
-; GFX90A-NEXT: v_mov_b32_e32 v3, v5
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: v_mov_b32_e32 v8, v5
+; GFX90A-NEXT: v_mov_b32_e32 v9, v5
+; GFX90A-NEXT: global_store_dwordx4 v3, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_2_4_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
-; GFX942-NEXT: v_mov_b32_e32 v2, v5
-; GFX942-NEXT: v_mov_b32_e32 v3, v5
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v7, v2
+; GFX942-NEXT: v_mov_b32_e32 v8, v5
+; GFX942-NEXT: v_mov_b32_e32 v9, v5
+; GFX942-NEXT: global_store_dwordx4 v3, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -5599,28 +5622,31 @@ define void @v_shuffle_v4i32_v3i32__5_3_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_3_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v1
+; GFX90A-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_3_4_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v4
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v1
+; GFX942-NEXT: v_mov_b32_e32 v7, v1
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -5648,28 +5674,31 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v6, v1
+; GFX90A-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_4_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v4
-; GFX942-NEXT: v_mov_b32_e32 v1, v4
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: v_mov_b32_e32 v6, v1
+; GFX942-NEXT: v_mov_b32_e32 v7, v1
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -5697,27 +5726,29 @@ define void @v_shuffle_v4i32_v3i32__5_5_u_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_u_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
-; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_u_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v4
-; GFX942-NEXT: v_mov_b32_e32 v1, v4
-; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v1
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -5749,16 +5780,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_4(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
-; GFX90A-NEXT: v_mov_b32_e32 v3, v5
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v4
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v3
+; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -5766,17 +5798,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_4(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
-; GFX942-NEXT: v_mov_b32_e32 v1, v6
-; GFX942-NEXT: v_mov_b32_e32 v3, v5
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v7, v4
+; GFX942-NEXT: v_mov_b32_e32 v9, v3
+; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -5808,17 +5840,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_4(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: v_mov_b32_e32 v3, v5
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v4
+; GFX90A-NEXT: v_mov_b32_e32 v8, v1
+; GFX90A-NEXT: v_mov_b32_e32 v9, v3
+; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -5826,17 +5858,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_4(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
-; GFX942-NEXT: v_mov_b32_e32 v1, v6
-; GFX942-NEXT: v_mov_b32_e32 v3, v5
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v8, v1
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v7, v4
+; GFX942-NEXT: v_mov_b32_e32 v9, v3
+; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -5922,27 +5954,31 @@ define void @v_shuffle_v4i32_v3i32__5_5_3_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_3_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
-; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_3_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v4
-; GFX942-NEXT: v_mov_b32_e32 v1, v4
-; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v7, v1
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -6508,30 +6544,31 @@ define void @v_shuffle_v4i32_v3i32__5_3_5_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_3_5_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_3_5_5:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v4
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
-; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v2
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -6604,28 +6641,29 @@ define void @v_shuffle_v4i32_v3i32__5_5_u_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_u_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
-; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_u_5:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v2
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: v_mov_b32_e32 v3, v2
-; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v2
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -6657,16 +6695,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_5(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
-; GFX90A-NEXT: v_mov_b32_e32 v3, v6
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v4
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v4
+; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -6674,17 +6713,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_5(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
-; GFX942-NEXT: v_mov_b32_e32 v1, v6
-; GFX942-NEXT: v_mov_b32_e32 v3, v6
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v7, v4
+; GFX942-NEXT: v_mov_b32_e32 v9, v4
+; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -6830,28 +6869,31 @@ define void @v_shuffle_v4i32_v3i32__5_5_3_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_3_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_3_5:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v4
-; GFX942-NEXT: v_mov_b32_e32 v1, v4
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
-; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v7, v2
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v4i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v4i32.ll
index 983afa566e2c1..ee2f94b90ffa9 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v4i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v4i32.ll
@@ -963,26 +963,29 @@ define void @v_shuffle_v4i32_v4i32__7_7_4_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_4_u:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_4_u:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v3
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -1009,26 +1012,29 @@ define void @v_shuffle_v4i32_v4i32__7_7_5_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_5_u:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: v_mov_b32_e32 v4, v1
-; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_mov_b32_e32 v8, v1
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_5_u:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: v_mov_b32_e32 v4, v1
-; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v3
+; GFX942-NEXT: v_mov_b32_e32 v8, v1
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -1440,31 +1446,31 @@ define void @v_shuffle_v4i32_v4i32__7_7_7_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_7_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v3
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v1
-; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_mov_b32_e32 v8, v3
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_7_5:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v3
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v7, v3
+; GFX942-NEXT: v_mov_b32_e32 v8, v3
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -1734,28 +1740,31 @@ define void @v_shuffle_v4i32_v4i32__2_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v4i32__2_0_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v0
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v4i32__2_0_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v0
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -2785,14 +2794,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_4_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v5
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, v5
+; GFX90A-NEXT: v_mov_b32_e32 v9, v5
+; GFX90A-NEXT: v_mov_b32_e32 v10, v2
+; GFX90A-NEXT: v_mov_b32_e32 v11, v0
+; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2802,14 +2812,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_4_0(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: v_mov_b32_e32 v4, v5
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v11, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v5
+; GFX942-NEXT: v_mov_b32_e32 v9, v5
+; GFX942-NEXT: v_mov_b32_e32 v10, v2
+; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -4138,14 +4149,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_4_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v5
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v1
-; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, v5
+; GFX90A-NEXT: v_mov_b32_e32 v9, v5
+; GFX90A-NEXT: v_mov_b32_e32 v10, v2
+; GFX90A-NEXT: v_mov_b32_e32 v11, v1
+; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -4155,14 +4167,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_4_1(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-NEXT: v_mov_b32_e32 v4, v5
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v11, v1
+; GFX942-NEXT: v_mov_b32_e32 v8, v5
+; GFX942-NEXT: v_mov_b32_e32 v9, v5
+; GFX942-NEXT: v_mov_b32_e32 v10, v2
+; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -4196,14 +4209,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_5_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v5
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v1
-; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, v5
+; GFX90A-NEXT: v_mov_b32_e32 v9, v5
+; GFX90A-NEXT: v_mov_b32_e32 v10, v3
+; GFX90A-NEXT: v_mov_b32_e32 v11, v1
+; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -4213,14 +4227,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_5_1(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-NEXT: v_mov_b32_e32 v4, v5
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v11, v1
+; GFX942-NEXT: v_mov_b32_e32 v8, v5
+; GFX942-NEXT: v_mov_b32_e32 v9, v5
+; GFX942-NEXT: v_mov_b32_e32 v10, v3
+; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -5448,34 +5463,37 @@ define void @v_shuffle_v4i32_v4i32__7_7_4_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_4_2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:7]
+; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v10, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:3]
+; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v7
-; GFX90A-NEXT: v_mov_b32_e32 v8, v4
-; GFX90A-NEXT: v_mov_b32_e32 v9, v2
-; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v10, v7
+; GFX90A-NEXT: v_mov_b32_e32 v11, v7
+; GFX90A-NEXT: v_mov_b32_e32 v12, v4
+; GFX90A-NEXT: v_mov_b32_e32 v13, v2
+; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_4_2:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:7]
+; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v10, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:3]
+; GFX942-NEXT: ; def v[4:7]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, v7
-; GFX942-NEXT: v_mov_b32_e32 v8, v4
-; GFX942-NEXT: v_mov_b32_e32 v9, v2
-; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v10, v7
+; GFX942-NEXT: v_mov_b32_e32 v11, v7
+; GFX942-NEXT: v_mov_b32_e32 v12, v4
+; GFX942-NEXT: v_mov_b32_e32 v13, v2
+; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -7172,28 +7190,31 @@ define void @v_shuffle_v4i32_v4i32__6_4_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v4i32__6_4_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v0
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v4i32__6_4_4_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v0
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -7271,28 +7292,29 @@ define void @v_shuffle_v4i32_v4i32__7_u_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_u_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v0
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_u_4_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v0
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -7559,30 +7581,31 @@ define void @v_shuffle_v4i32_v4i32__7_5_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_5_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v0
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_5_4_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: v_mov_b32_e32 v3, v1
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v1
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v0
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -7660,28 +7683,31 @@ define void @v_shuffle_v4i32_v4i32__7_7_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v0
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_4_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v3
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v0
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -7763,14 +7789,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_0_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v5
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: v_mov_b32_e32 v7, v2
-; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, v5
+; GFX90A-NEXT: v_mov_b32_e32 v9, v5
+; GFX90A-NEXT: v_mov_b32_e32 v10, v0
+; GFX90A-NEXT: v_mov_b32_e32 v11, v2
+; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -7780,14 +7807,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_0_4(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: v_mov_b32_e32 v4, v5
-; GFX942-NEXT: v_mov_b32_e32 v7, v2
-; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v10, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v5
+; GFX942-NEXT: v_mov_b32_e32 v9, v5
+; GFX942-NEXT: v_mov_b32_e32 v11, v2
+; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -8041,31 +8069,31 @@ define void @v_shuffle_v4i32_v4i32__7_7_6_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_6_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v3
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_mov_b32_e32 v8, v2
+; GFX90A-NEXT: v_mov_b32_e32 v9, v0
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_6_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v3
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v3
+; GFX942-NEXT: v_mov_b32_e32 v8, v2
+; GFX942-NEXT: v_mov_b32_e32 v9, v0
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -8946,28 +8974,31 @@ define void @v_shuffle_v4i32_v4i32__7_7_5_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_5_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: v_mov_b32_e32 v4, v1
-; GFX90A-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_mov_b32_e32 v8, v1
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_5_5:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: v_mov_b32_e32 v4, v1
-; GFX942-NEXT: v_mov_b32_e32 v5, v1
-; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v3
+; GFX942-NEXT: v_mov_b32_e32 v8, v1
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -9047,14 +9078,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_0_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v5
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, v5
+; GFX90A-NEXT: v_mov_b32_e32 v9, v5
+; GFX90A-NEXT: v_mov_b32_e32 v10, v0
+; GFX90A-NEXT: v_mov_b32_e32 v11, v3
+; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -9064,14 +9096,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_0_5(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: v_mov_b32_e32 v4, v5
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v10, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v5
+; GFX942-NEXT: v_mov_b32_e32 v9, v5
+; GFX942-NEXT: v_mov_b32_e32 v11, v3
+; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -9105,14 +9138,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_1_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v5
-; GFX90A-NEXT: v_mov_b32_e32 v6, v1
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, v5
+; GFX90A-NEXT: v_mov_b32_e32 v9, v5
+; GFX90A-NEXT: v_mov_b32_e32 v10, v1
+; GFX90A-NEXT: v_mov_b32_e32 v11, v3
+; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -9122,14 +9156,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_1_5(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, v1
-; GFX942-NEXT: v_mov_b32_e32 v4, v5
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v10, v1
+; GFX942-NEXT: v_mov_b32_e32 v8, v5
+; GFX942-NEXT: v_mov_b32_e32 v9, v5
+; GFX942-NEXT: v_mov_b32_e32 v11, v3
+; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -9277,28 +9312,31 @@ define void @v_shuffle_v4i32_v4i32__7_7_4_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_4_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_4_5:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: v_mov_b32_e32 v5, v1
-; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v3
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -9327,31 +9365,31 @@ define void @v_shuffle_v4i32_v4i32__7_7_6_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_6_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v3
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v1
-; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_mov_b32_e32 v8, v2
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_6_5:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v3
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v3
+; GFX942-NEXT: v_mov_b32_e32 v8, v2
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -10319,15 +10357,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_0_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v10, 0
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v5
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v4
-; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, v5
+; GFX90A-NEXT: v_mov_b32_e32 v9, v5
+; GFX90A-NEXT: v_mov_b32_e32 v10, v0
+; GFX90A-NEXT: v_mov_b32_e32 v11, v4
+; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -10337,15 +10375,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_0_6(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v10, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v5
-; GFX942-NEXT: v_mov_b32_e32 v7, v5
-; GFX942-NEXT: v_mov_b32_e32 v9, v4
-; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v10, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v5
+; GFX942-NEXT: v_mov_b32_e32 v9, v5
+; GFX942-NEXT: v_mov_b32_e32 v11, v4
+; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -10553,31 +10591,31 @@ define void @v_shuffle_v4i32_v4i32__7_7_4_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_4_6:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v3
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: v_mov_b32_e32 v7, v2
-; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v2
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_4_6:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v3
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: v_mov_b32_e32 v7, v2
-; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v3
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v2
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -11777,31 +11815,31 @@ define void @v_shuffle_v4i32_v4i32__7_7_4_7(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_4_7:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v3
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v3
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_4_7:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v3
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v3
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -11830,31 +11868,31 @@ define void @v_shuffle_v4i32_v4i32__7_7_5_7(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_5_7:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v3
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_mov_b32_e32 v6, v1
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, v1
+; GFX90A-NEXT: v_mov_b32_e32 v9, v3
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_5_7:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v3
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
-; GFX942-NEXT: v_mov_b32_e32 v6, v1
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v8, v1
+; GFX942-NEXT: v_mov_b32_e32 v9, v3
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v2i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v2i64.ll
index ac7d9557ce765..21ec9acf6317d 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v2i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v2i64.ll
@@ -291,27 +291,31 @@ define void @v_shuffle_v4i64_v2i64__3_2_u_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_2_u_u:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i64_v2i64__3_2_u_u:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: v_mov_b32_e32 v5, v1
-; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v3
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i64> asm "; def $0", "=v"()
@@ -624,15 +628,15 @@ define void @v_shuffle_v4i64_v2i64__3_3_3_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v10, 0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v4
+; GFX90A-NEXT: v_mov_b32_e32 v9, v5
+; GFX90A-NEXT: v_mov_b32_e32 v10, v0
+; GFX90A-NEXT: v_mov_b32_e32 v11, v1
; GFX90A-NEXT: v_mov_b32_e32 v2, v4
; GFX90A-NEXT: v_mov_b32_e32 v3, v5
-; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -642,18 +646,18 @@ define void @v_shuffle_v4i64_v2i64__3_3_3_0(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v10, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v7, v5
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-NEXT: v_mov_b32_e32 v10, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v4
+; GFX942-NEXT: v_mov_b32_e32 v9, v5
+; GFX942-NEXT: v_mov_b32_e32 v11, v1
; GFX942-NEXT: v_mov_b32_e32 v2, v4
; GFX942-NEXT: v_mov_b32_e32 v3, v5
-; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i64> asm "; def $0", "=v"()
@@ -752,15 +756,15 @@ define void @v_shuffle_v4i64_v2i64__3_3_3_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: v_mov_b32_e32 v1, v3
-; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -770,15 +774,15 @@ define void @v_shuffle_v4i64_v2i64__3_3_3_2(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: v_mov_b32_e32 v7, v1
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v3
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: v_mov_b32_e32 v1, v3
-; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i64> asm "; def $0", "=v"()
@@ -953,33 +957,39 @@ define void @v_shuffle_v4i64_v2i64__1_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i64_v2i64__1_0_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: v_mov_b32_e32 v5, v1
; GFX90A-NEXT: v_mov_b32_e32 v6, v0
; GFX90A-NEXT: v_mov_b32_e32 v7, v1
-; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i64_v2i64__1_0_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: v_mov_b32_e32 v5, v1
; GFX942-NEXT: v_mov_b32_e32 v6, v0
; GFX942-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v3
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i64> asm "; def $0", "=v"()
@@ -1206,18 +1216,18 @@ define void @v_shuffle_v4i64_v2i64__3_1_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v12, 0
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
; GFX90A-NEXT: v_mov_b32_e32 v10, v0
; GFX90A-NEXT: v_mov_b32_e32 v11, v1
+; GFX90A-NEXT: v_mov_b32_e32 v12, v0
+; GFX90A-NEXT: v_mov_b32_e32 v13, v1
; GFX90A-NEXT: v_mov_b32_e32 v0, v6
; GFX90A-NEXT: v_mov_b32_e32 v1, v7
-; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1227,18 +1237,18 @@ define void @v_shuffle_v4i64_v2i64__3_1_0_0(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v12, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[4:7]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
; GFX942-NEXT: v_mov_b32_e32 v10, v0
; GFX942-NEXT: v_mov_b32_e32 v11, v1
+; GFX942-NEXT: v_mov_b32_e32 v12, v0
+; GFX942-NEXT: v_mov_b32_e32 v13, v1
; GFX942-NEXT: v_mov_b32_e32 v0, v6
; GFX942-NEXT: v_mov_b32_e32 v1, v7
-; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i64> asm "; def $0", "=v"()
@@ -1556,15 +1566,15 @@ define void @v_shuffle_v4i64_v2i64__3_3_2_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v10, 0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v2
+; GFX90A-NEXT: v_mov_b32_e32 v9, v3
+; GFX90A-NEXT: v_mov_b32_e32 v10, v0
+; GFX90A-NEXT: v_mov_b32_e32 v11, v1
; GFX90A-NEXT: v_mov_b32_e32 v2, v4
; GFX90A-NEXT: v_mov_b32_e32 v3, v5
-; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1574,18 +1584,18 @@ define void @v_shuffle_v4i64_v2i64__3_3_2_0(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v10, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-NEXT: v_mov_b32_e32 v10, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v2
+; GFX942-NEXT: v_mov_b32_e32 v9, v3
+; GFX942-NEXT: v_mov_b32_e32 v11, v1
; GFX942-NEXT: v_mov_b32_e32 v2, v4
; GFX942-NEXT: v_mov_b32_e32 v3, v5
-; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i64> asm "; def $0", "=v"()
@@ -1663,33 +1673,33 @@ define void @v_shuffle_v4i64_v2i64__0_1_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i64_v2i64__0_1_1_1:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: v_mov_b32_e32 v6, v2
; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, v2
+; GFX90A-NEXT: v_mov_b32_e32 v9, v3
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i64_v2i64__0_1_1_1:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
; GFX942-NEXT: v_mov_b32_e32 v6, v2
; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v8, v2
+; GFX942-NEXT: v_mov_b32_e32 v9, v3
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i64> asm "; def $0", "=v"()
@@ -1961,17 +1971,17 @@ define void @v_shuffle_v4i64_v2i64__3_0_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v12, 0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v2
-; GFX90A-NEXT: v_mov_b32_e32 v9, v3
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: v_mov_b32_e32 v10, v2
; GFX90A-NEXT: v_mov_b32_e32 v11, v3
+; GFX90A-NEXT: v_mov_b32_e32 v12, v2
+; GFX90A-NEXT: v_mov_b32_e32 v13, v3
; GFX90A-NEXT: v_mov_b32_e32 v2, v6
; GFX90A-NEXT: v_mov_b32_e32 v3, v7
; GFX90A-NEXT: v_mov_b32_e32 v4, v0
; GFX90A-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1984,17 +1994,17 @@ define void @v_shuffle_v4i64_v2i64__3_0_1_1(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[4:7]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v12, 0
-; GFX942-NEXT: v_mov_b32_e32 v8, v2
-; GFX942-NEXT: v_mov_b32_e32 v9, v3
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: v_mov_b32_e32 v10, v2
; GFX942-NEXT: v_mov_b32_e32 v11, v3
+; GFX942-NEXT: v_mov_b32_e32 v12, v2
+; GFX942-NEXT: v_mov_b32_e32 v13, v3
; GFX942-NEXT: v_mov_b32_e32 v2, v6
; GFX942-NEXT: v_mov_b32_e32 v3, v7
; GFX942-NEXT: v_mov_b32_e32 v4, v0
; GFX942-NEXT: v_mov_b32_e32 v5, v1
-; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i64> asm "; def $0", "=v"()
@@ -2447,33 +2457,39 @@ define void @v_shuffle_v4i64_v2i64__3_2_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_2_2_2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: v_mov_b32_e32 v5, v1
; GFX90A-NEXT: v_mov_b32_e32 v6, v0
; GFX90A-NEXT: v_mov_b32_e32 v7, v1
-; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i64_v2i64__3_2_2_2:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: v_mov_b32_e32 v5, v1
; GFX942-NEXT: v_mov_b32_e32 v6, v0
; GFX942-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v3
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i64> asm "; def $0", "=v"()
@@ -2508,15 +2524,15 @@ define void @v_shuffle_v4i64_v2i64__3_u_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: v_mov_b32_e32 v5, v1
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: v_mov_b32_e32 v6, v0
; GFX90A-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: v_mov_b32_e32 v1, v3
-; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2526,15 +2542,15 @@ define void @v_shuffle_v4i64_v2i64__3_u_2_2(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: v_mov_b32_e32 v5, v1
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: v_mov_b32_e32 v6, v0
; GFX942-NEXT: v_mov_b32_e32 v7, v1
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: v_mov_b32_e32 v1, v3
-; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i64> asm "; def $0", "=v"()
@@ -2577,17 +2593,17 @@ define void @v_shuffle_v4i64_v2i64__3_0_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v10, 0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: v_mov_b32_e32 v8, v2
; GFX90A-NEXT: v_mov_b32_e32 v9, v3
+; GFX90A-NEXT: v_mov_b32_e32 v10, v2
+; GFX90A-NEXT: v_mov_b32_e32 v11, v3
; GFX90A-NEXT: v_mov_b32_e32 v2, v4
; GFX90A-NEXT: v_mov_b32_e32 v3, v5
; GFX90A-NEXT: v_mov_b32_e32 v4, v0
; GFX90A-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2597,21 +2613,21 @@ define void @v_shuffle_v4i64_v2i64__3_0_2_2(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v10, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
; GFX942-NEXT: v_mov_b32_e32 v8, v2
; GFX942-NEXT: v_mov_b32_e32 v9, v3
+; GFX942-NEXT: v_mov_b32_e32 v10, v2
+; GFX942-NEXT: v_mov_b32_e32 v11, v3
; GFX942-NEXT: v_mov_b32_e32 v2, v4
; GFX942-NEXT: v_mov_b32_e32 v3, v5
; GFX942-NEXT: v_mov_b32_e32 v4, v0
; GFX942-NEXT: v_mov_b32_e32 v5, v1
-; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i64> asm "; def $0", "=v"()
@@ -2649,18 +2665,18 @@ define void @v_shuffle_v4i64_v2i64__3_1_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v12, 0
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, v4
-; GFX90A-NEXT: v_mov_b32_e32 v9, v5
; GFX90A-NEXT: v_mov_b32_e32 v10, v4
; GFX90A-NEXT: v_mov_b32_e32 v11, v5
+; GFX90A-NEXT: v_mov_b32_e32 v12, v4
+; GFX90A-NEXT: v_mov_b32_e32 v13, v5
; GFX90A-NEXT: v_mov_b32_e32 v0, v6
; GFX90A-NEXT: v_mov_b32_e32 v1, v7
-; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2670,19 +2686,19 @@ define void @v_shuffle_v4i64_v2i64__3_1_2_2(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v12, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[4:7]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v8, v4
-; GFX942-NEXT: v_mov_b32_e32 v9, v5
; GFX942-NEXT: v_mov_b32_e32 v10, v4
; GFX942-NEXT: v_mov_b32_e32 v11, v5
+; GFX942-NEXT: v_mov_b32_e32 v12, v4
+; GFX942-NEXT: v_mov_b32_e32 v13, v5
; GFX942-NEXT: v_mov_b32_e32 v0, v6
; GFX942-NEXT: v_mov_b32_e32 v1, v7
-; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i64> asm "; def $0", "=v"()
@@ -2717,15 +2733,15 @@ define void @v_shuffle_v4i64_v2i64__3_3_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: v_mov_b32_e32 v5, v1
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: v_mov_b32_e32 v6, v0
; GFX90A-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: v_mov_b32_e32 v1, v3
-; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2735,15 +2751,15 @@ define void @v_shuffle_v4i64_v2i64__3_3_2_2(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: v_mov_b32_e32 v5, v1
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: v_mov_b32_e32 v6, v0
; GFX942-NEXT: v_mov_b32_e32 v7, v1
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: v_mov_b32_e32 v1, v3
-; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i64> asm "; def $0", "=v"()
@@ -2776,13 +2792,13 @@ define void @v_shuffle_v4i64_v2i64__3_3_u_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: v_mov_b32_e32 v5, v1
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v1
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: v_mov_b32_e32 v1, v3
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v4, v[4:7], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2792,13 +2808,13 @@ define void @v_shuffle_v4i64_v2i64__3_3_u_2(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: v_mov_b32_e32 v5, v1
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v7, v1
; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: v_mov_b32_e32 v1, v3
-; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v4, v[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i64> asm "; def $0", "=v"()
@@ -3134,33 +3150,33 @@ define void @v_shuffle_v4i64_v2i64__2_3_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i64_v2i64__2_3_3_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: v_mov_b32_e32 v6, v2
; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, v2
+; GFX90A-NEXT: v_mov_b32_e32 v9, v3
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i64_v2i64__2_3_3_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
; GFX942-NEXT: v_mov_b32_e32 v6, v2
; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v8, v2
+; GFX942-NEXT: v_mov_b32_e32 v9, v3
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i64> asm "; def $0", "=v"()
@@ -3374,39 +3390,39 @@ define void @v_shuffle_v4i64_v2i64__3_2_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_2_3_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: v_mov_b32_e32 v6, v2
; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16
+; GFX90A-NEXT: v_mov_b32_e32 v8, v2
+; GFX90A-NEXT: v_mov_b32_e32 v9, v3
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i64_v2i64__3_2_3_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
; GFX942-NEXT: v_mov_b32_e32 v6, v2
; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX942-NEXT: v_mov_b32_e32 v8, v2
+; GFX942-NEXT: v_mov_b32_e32 v9, v3
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: v_mov_b32_e32 v5, v1
-; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i64> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v3i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v3i64.ll
index 8dd4a40d00680..615b382aa355a 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v3i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v3i64.ll
@@ -1126,15 +1126,15 @@ define void @v_shuffle_v4i64_v3i64__5_5_5_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v10, 0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v4
+; GFX90A-NEXT: v_mov_b32_e32 v9, v5
+; GFX90A-NEXT: v_mov_b32_e32 v10, v0
+; GFX90A-NEXT: v_mov_b32_e32 v11, v1
; GFX90A-NEXT: v_mov_b32_e32 v2, v4
; GFX90A-NEXT: v_mov_b32_e32 v3, v5
-; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1144,15 +1144,15 @@ define void @v_shuffle_v4i64_v3i64__5_5_5_3(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v10, 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v7, v5
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, v4
+; GFX942-NEXT: v_mov_b32_e32 v9, v5
+; GFX942-NEXT: v_mov_b32_e32 v10, v0
+; GFX942-NEXT: v_mov_b32_e32 v11, v1
; GFX942-NEXT: v_mov_b32_e32 v2, v4
; GFX942-NEXT: v_mov_b32_e32 v3, v5
-; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i64> asm "; def $0", "=v"()
@@ -1388,13 +1388,15 @@ define void @v_shuffle_v4i64_v3i64__1_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
+; GFX90A-NEXT: v_mov_b32_e32 v10, v0
+; GFX90A-NEXT: v_mov_b32_e32 v11, v1
; GFX90A-NEXT: v_mov_b32_e32 v4, v0
; GFX90A-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: v_mov_b32_e32 v7, v1
-; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1404,13 +1406,15 @@ define void @v_shuffle_v4i64_v3i64__1_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-NEXT: v_mov_b32_e32 v10, v0
+; GFX942-NEXT: v_mov_b32_e32 v11, v1
; GFX942-NEXT: v_mov_b32_e32 v4, v0
; GFX942-NEXT: v_mov_b32_e32 v5, v1
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i64> asm "; def $0", "=v"()
@@ -3637,33 +3641,33 @@ define void @v_shuffle_v4i64_v3i64__1_2_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i64_v3i64__1_2_2_2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v10, 0
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
; GFX90A-NEXT: v_mov_b32_e32 v8, v4
; GFX90A-NEXT: v_mov_b32_e32 v9, v5
-; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v10, v4
+; GFX90A-NEXT: v_mov_b32_e32 v11, v5
+; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i64_v3i64__1_2_2_2:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v10, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:5]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v7, v5
; GFX942-NEXT: v_mov_b32_e32 v8, v4
; GFX942-NEXT: v_mov_b32_e32 v9, v5
-; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v10, v4
+; GFX942-NEXT: v_mov_b32_e32 v11, v5
+; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i64> asm "; def $0", "=v"()
@@ -4787,13 +4791,15 @@ define void @v_shuffle_v4i64_v3i64__4_3_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
+; GFX90A-NEXT: v_mov_b32_e32 v10, v0
+; GFX90A-NEXT: v_mov_b32_e32 v11, v1
; GFX90A-NEXT: v_mov_b32_e32 v4, v0
; GFX90A-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: v_mov_b32_e32 v7, v1
-; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -4803,13 +4809,15 @@ define void @v_shuffle_v4i64_v3i64__4_3_3_3(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-NEXT: v_mov_b32_e32 v10, v0
+; GFX942-NEXT: v_mov_b32_e32 v11, v1
; GFX942-NEXT: v_mov_b32_e32 v4, v0
; GFX942-NEXT: v_mov_b32_e32 v5, v1
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i64> asm "; def $0", "=v"()
@@ -5169,15 +5177,15 @@ define void @v_shuffle_v4i64_v3i64__5_4_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v10, 0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: v_mov_b32_e32 v8, v0
; GFX90A-NEXT: v_mov_b32_e32 v9, v1
+; GFX90A-NEXT: v_mov_b32_e32 v10, v0
+; GFX90A-NEXT: v_mov_b32_e32 v11, v1
; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: v_mov_b32_e32 v1, v5
-; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -5187,15 +5195,15 @@ define void @v_shuffle_v4i64_v3i64__5_4_3_3(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v10, 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: v_mov_b32_e32 v7, v1
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: v_mov_b32_e32 v8, v0
; GFX942-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-NEXT: v_mov_b32_e32 v10, v0
+; GFX942-NEXT: v_mov_b32_e32 v11, v1
; GFX942-NEXT: v_mov_b32_e32 v0, v4
; GFX942-NEXT: v_mov_b32_e32 v1, v5
-; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i64> asm "; def $0", "=v"()
@@ -5532,15 +5540,15 @@ define void @v_shuffle_v4i64_v3i64__5_5_4_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v10, 0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v2
+; GFX90A-NEXT: v_mov_b32_e32 v9, v3
+; GFX90A-NEXT: v_mov_b32_e32 v10, v0
+; GFX90A-NEXT: v_mov_b32_e32 v11, v1
; GFX90A-NEXT: v_mov_b32_e32 v2, v4
; GFX90A-NEXT: v_mov_b32_e32 v3, v5
-; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -5550,15 +5558,15 @@ define void @v_shuffle_v4i64_v3i64__5_5_4_3(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v10, 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, v2
+; GFX942-NEXT: v_mov_b32_e32 v9, v3
+; GFX942-NEXT: v_mov_b32_e32 v10, v0
+; GFX942-NEXT: v_mov_b32_e32 v11, v1
; GFX942-NEXT: v_mov_b32_e32 v2, v4
; GFX942-NEXT: v_mov_b32_e32 v3, v5
-; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i64> asm "; def $0", "=v"()
@@ -6255,17 +6263,17 @@ define void @v_shuffle_v4i64_v3i64__5_3_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v10, 0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: v_mov_b32_e32 v8, v2
; GFX90A-NEXT: v_mov_b32_e32 v9, v3
+; GFX90A-NEXT: v_mov_b32_e32 v10, v2
+; GFX90A-NEXT: v_mov_b32_e32 v11, v3
; GFX90A-NEXT: v_mov_b32_e32 v2, v4
; GFX90A-NEXT: v_mov_b32_e32 v3, v5
; GFX90A-NEXT: v_mov_b32_e32 v4, v0
; GFX90A-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -6275,17 +6283,17 @@ define void @v_shuffle_v4i64_v3i64__5_3_4_4(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v10, 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: v_mov_b32_e32 v8, v2
; GFX942-NEXT: v_mov_b32_e32 v9, v3
+; GFX942-NEXT: v_mov_b32_e32 v10, v2
+; GFX942-NEXT: v_mov_b32_e32 v11, v3
; GFX942-NEXT: v_mov_b32_e32 v2, v4
; GFX942-NEXT: v_mov_b32_e32 v3, v5
; GFX942-NEXT: v_mov_b32_e32 v4, v0
; GFX942-NEXT: v_mov_b32_e32 v5, v1
-; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i64> asm "; def $0", "=v"()
@@ -6970,33 +6978,33 @@ define void @v_shuffle_v4i64_v3i64__4_5_5_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i64_v3i64__4_5_5_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v10, 0
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
; GFX90A-NEXT: v_mov_b32_e32 v8, v4
; GFX90A-NEXT: v_mov_b32_e32 v9, v5
-; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v10, v4
+; GFX90A-NEXT: v_mov_b32_e32 v11, v5
+; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i64_v3i64__4_5_5_5:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v10, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:5]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v7, v5
; GFX942-NEXT: v_mov_b32_e32 v8, v4
; GFX942-NEXT: v_mov_b32_e32 v9, v5
-; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v10, v4
+; GFX942-NEXT: v_mov_b32_e32 v11, v5
+; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i64> asm "; def $0", "=v"()
@@ -7344,15 +7352,15 @@ define void @v_shuffle_v4i64_v3i64__5_4_5_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v10, 0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: v_mov_b32_e32 v8, v4
; GFX90A-NEXT: v_mov_b32_e32 v9, v5
+; GFX90A-NEXT: v_mov_b32_e32 v10, v4
+; GFX90A-NEXT: v_mov_b32_e32 v11, v5
; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: v_mov_b32_e32 v1, v5
-; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -7362,15 +7370,15 @@ define void @v_shuffle_v4i64_v3i64__5_4_5_5(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v10, 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v7, v5
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: v_mov_b32_e32 v8, v4
; GFX942-NEXT: v_mov_b32_e32 v9, v5
+; GFX942-NEXT: v_mov_b32_e32 v10, v4
+; GFX942-NEXT: v_mov_b32_e32 v11, v5
; GFX942-NEXT: v_mov_b32_e32 v0, v4
; GFX942-NEXT: v_mov_b32_e32 v1, v5
-; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i64> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v4i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v4i64.ll
index ea9ef2f1ac94a..32f6e00716e37 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v4i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v4i64.ll
@@ -8328,15 +8328,15 @@ define void @v_shuffle_v4i64_v4i64__7_5_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v12, 0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: v_mov_b32_e32 v10, v0
; GFX90A-NEXT: v_mov_b32_e32 v11, v1
+; GFX90A-NEXT: v_mov_b32_e32 v12, v0
+; GFX90A-NEXT: v_mov_b32_e32 v13, v1
; GFX90A-NEXT: v_mov_b32_e32 v0, v6
; GFX90A-NEXT: v_mov_b32_e32 v1, v7
-; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -8346,15 +8346,15 @@ define void @v_shuffle_v4i64_v4i64__7_5_4_4(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:7]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v12, 0
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: v_mov_b32_e32 v10, v0
; GFX942-NEXT: v_mov_b32_e32 v11, v1
+; GFX942-NEXT: v_mov_b32_e32 v12, v0
+; GFX942-NEXT: v_mov_b32_e32 v13, v1
; GFX942-NEXT: v_mov_b32_e32 v0, v6
; GFX942-NEXT: v_mov_b32_e32 v1, v7
-; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i64> asm "; def $0", "=v"()
@@ -11254,15 +11254,15 @@ define void @v_shuffle_v4i64_v4i64__7_5_6_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v12, 0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v4
-; GFX90A-NEXT: v_mov_b32_e32 v9, v5
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: v_mov_b32_e32 v10, v4
; GFX90A-NEXT: v_mov_b32_e32 v11, v5
+; GFX90A-NEXT: v_mov_b32_e32 v12, v4
+; GFX90A-NEXT: v_mov_b32_e32 v13, v5
; GFX90A-NEXT: v_mov_b32_e32 v0, v6
; GFX90A-NEXT: v_mov_b32_e32 v1, v7
-; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -11272,15 +11272,15 @@ define void @v_shuffle_v4i64_v4i64__7_5_6_6(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:7]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v12, 0
-; GFX942-NEXT: v_mov_b32_e32 v8, v4
-; GFX942-NEXT: v_mov_b32_e32 v9, v5
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: v_mov_b32_e32 v10, v4
; GFX942-NEXT: v_mov_b32_e32 v11, v5
+; GFX942-NEXT: v_mov_b32_e32 v12, v4
+; GFX942-NEXT: v_mov_b32_e32 v13, v5
; GFX942-NEXT: v_mov_b32_e32 v0, v6
; GFX942-NEXT: v_mov_b32_e32 v1, v7
-; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i64> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v2p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v2p0.ll
index b30af835a7882..ee3b303f88471 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v2p0.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v2p0.ll
@@ -291,27 +291,31 @@ define void @v_shuffle_v4p0_v2p0__3_2_u_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_2_u_u:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p0_v2p0__3_2_u_u:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: v_mov_b32_e32 v5, v1
-; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v3
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr> asm "; def $0", "=v"()
@@ -624,15 +628,15 @@ define void @v_shuffle_v4p0_v2p0__3_3_3_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v10, 0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v4
+; GFX90A-NEXT: v_mov_b32_e32 v9, v5
+; GFX90A-NEXT: v_mov_b32_e32 v10, v0
+; GFX90A-NEXT: v_mov_b32_e32 v11, v1
; GFX90A-NEXT: v_mov_b32_e32 v2, v4
; GFX90A-NEXT: v_mov_b32_e32 v3, v5
-; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -642,18 +646,18 @@ define void @v_shuffle_v4p0_v2p0__3_3_3_0(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v10, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v7, v5
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-NEXT: v_mov_b32_e32 v10, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v4
+; GFX942-NEXT: v_mov_b32_e32 v9, v5
+; GFX942-NEXT: v_mov_b32_e32 v11, v1
; GFX942-NEXT: v_mov_b32_e32 v2, v4
; GFX942-NEXT: v_mov_b32_e32 v3, v5
-; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr> asm "; def $0", "=v"()
@@ -752,15 +756,15 @@ define void @v_shuffle_v4p0_v2p0__3_3_3_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: v_mov_b32_e32 v1, v3
-; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -770,15 +774,15 @@ define void @v_shuffle_v4p0_v2p0__3_3_3_2(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: v_mov_b32_e32 v7, v1
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v3
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: v_mov_b32_e32 v1, v3
-; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr> asm "; def $0", "=v"()
@@ -953,33 +957,39 @@ define void @v_shuffle_v4p0_v2p0__1_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p0_v2p0__1_0_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: v_mov_b32_e32 v5, v1
; GFX90A-NEXT: v_mov_b32_e32 v6, v0
; GFX90A-NEXT: v_mov_b32_e32 v7, v1
-; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p0_v2p0__1_0_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: v_mov_b32_e32 v5, v1
; GFX942-NEXT: v_mov_b32_e32 v6, v0
; GFX942-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v3
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr> asm "; def $0", "=v"()
@@ -1206,18 +1216,18 @@ define void @v_shuffle_v4p0_v2p0__3_1_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v12, 0
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
; GFX90A-NEXT: v_mov_b32_e32 v10, v0
; GFX90A-NEXT: v_mov_b32_e32 v11, v1
+; GFX90A-NEXT: v_mov_b32_e32 v12, v0
+; GFX90A-NEXT: v_mov_b32_e32 v13, v1
; GFX90A-NEXT: v_mov_b32_e32 v0, v6
; GFX90A-NEXT: v_mov_b32_e32 v1, v7
-; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1227,18 +1237,18 @@ define void @v_shuffle_v4p0_v2p0__3_1_0_0(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v12, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[4:7]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
; GFX942-NEXT: v_mov_b32_e32 v10, v0
; GFX942-NEXT: v_mov_b32_e32 v11, v1
+; GFX942-NEXT: v_mov_b32_e32 v12, v0
+; GFX942-NEXT: v_mov_b32_e32 v13, v1
; GFX942-NEXT: v_mov_b32_e32 v0, v6
; GFX942-NEXT: v_mov_b32_e32 v1, v7
-; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr> asm "; def $0", "=v"()
@@ -1556,15 +1566,15 @@ define void @v_shuffle_v4p0_v2p0__3_3_2_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v10, 0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v2
+; GFX90A-NEXT: v_mov_b32_e32 v9, v3
+; GFX90A-NEXT: v_mov_b32_e32 v10, v0
+; GFX90A-NEXT: v_mov_b32_e32 v11, v1
; GFX90A-NEXT: v_mov_b32_e32 v2, v4
; GFX90A-NEXT: v_mov_b32_e32 v3, v5
-; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1574,18 +1584,18 @@ define void @v_shuffle_v4p0_v2p0__3_3_2_0(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v10, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-NEXT: v_mov_b32_e32 v10, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v2
+; GFX942-NEXT: v_mov_b32_e32 v9, v3
+; GFX942-NEXT: v_mov_b32_e32 v11, v1
; GFX942-NEXT: v_mov_b32_e32 v2, v4
; GFX942-NEXT: v_mov_b32_e32 v3, v5
-; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr> asm "; def $0", "=v"()
@@ -1663,33 +1673,33 @@ define void @v_shuffle_v4p0_v2p0__0_1_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p0_v2p0__0_1_1_1:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: v_mov_b32_e32 v6, v2
; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, v2
+; GFX90A-NEXT: v_mov_b32_e32 v9, v3
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p0_v2p0__0_1_1_1:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
; GFX942-NEXT: v_mov_b32_e32 v6, v2
; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v8, v2
+; GFX942-NEXT: v_mov_b32_e32 v9, v3
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr> asm "; def $0", "=v"()
@@ -1961,17 +1971,17 @@ define void @v_shuffle_v4p0_v2p0__3_0_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v12, 0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v2
-; GFX90A-NEXT: v_mov_b32_e32 v9, v3
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: v_mov_b32_e32 v10, v2
; GFX90A-NEXT: v_mov_b32_e32 v11, v3
+; GFX90A-NEXT: v_mov_b32_e32 v12, v2
+; GFX90A-NEXT: v_mov_b32_e32 v13, v3
; GFX90A-NEXT: v_mov_b32_e32 v2, v6
; GFX90A-NEXT: v_mov_b32_e32 v3, v7
; GFX90A-NEXT: v_mov_b32_e32 v4, v0
; GFX90A-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1984,17 +1994,17 @@ define void @v_shuffle_v4p0_v2p0__3_0_1_1(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[4:7]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v12, 0
-; GFX942-NEXT: v_mov_b32_e32 v8, v2
-; GFX942-NEXT: v_mov_b32_e32 v9, v3
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: v_mov_b32_e32 v10, v2
; GFX942-NEXT: v_mov_b32_e32 v11, v3
+; GFX942-NEXT: v_mov_b32_e32 v12, v2
+; GFX942-NEXT: v_mov_b32_e32 v13, v3
; GFX942-NEXT: v_mov_b32_e32 v2, v6
; GFX942-NEXT: v_mov_b32_e32 v3, v7
; GFX942-NEXT: v_mov_b32_e32 v4, v0
; GFX942-NEXT: v_mov_b32_e32 v5, v1
-; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr> asm "; def $0", "=v"()
@@ -2447,33 +2457,39 @@ define void @v_shuffle_v4p0_v2p0__3_2_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_2_2_2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: v_mov_b32_e32 v5, v1
; GFX90A-NEXT: v_mov_b32_e32 v6, v0
; GFX90A-NEXT: v_mov_b32_e32 v7, v1
-; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p0_v2p0__3_2_2_2:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: v_mov_b32_e32 v5, v1
; GFX942-NEXT: v_mov_b32_e32 v6, v0
; GFX942-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v3
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr> asm "; def $0", "=v"()
@@ -2508,15 +2524,15 @@ define void @v_shuffle_v4p0_v2p0__3_u_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: v_mov_b32_e32 v5, v1
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: v_mov_b32_e32 v6, v0
; GFX90A-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: v_mov_b32_e32 v1, v3
-; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2526,15 +2542,15 @@ define void @v_shuffle_v4p0_v2p0__3_u_2_2(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: v_mov_b32_e32 v5, v1
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: v_mov_b32_e32 v6, v0
; GFX942-NEXT: v_mov_b32_e32 v7, v1
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: v_mov_b32_e32 v1, v3
-; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr> asm "; def $0", "=v"()
@@ -2577,17 +2593,17 @@ define void @v_shuffle_v4p0_v2p0__3_0_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v10, 0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: v_mov_b32_e32 v8, v2
; GFX90A-NEXT: v_mov_b32_e32 v9, v3
+; GFX90A-NEXT: v_mov_b32_e32 v10, v2
+; GFX90A-NEXT: v_mov_b32_e32 v11, v3
; GFX90A-NEXT: v_mov_b32_e32 v2, v4
; GFX90A-NEXT: v_mov_b32_e32 v3, v5
; GFX90A-NEXT: v_mov_b32_e32 v4, v0
; GFX90A-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2597,21 +2613,21 @@ define void @v_shuffle_v4p0_v2p0__3_0_2_2(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v10, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
; GFX942-NEXT: v_mov_b32_e32 v8, v2
; GFX942-NEXT: v_mov_b32_e32 v9, v3
+; GFX942-NEXT: v_mov_b32_e32 v10, v2
+; GFX942-NEXT: v_mov_b32_e32 v11, v3
; GFX942-NEXT: v_mov_b32_e32 v2, v4
; GFX942-NEXT: v_mov_b32_e32 v3, v5
; GFX942-NEXT: v_mov_b32_e32 v4, v0
; GFX942-NEXT: v_mov_b32_e32 v5, v1
-; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr> asm "; def $0", "=v"()
@@ -2649,18 +2665,18 @@ define void @v_shuffle_v4p0_v2p0__3_1_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v12, 0
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, v4
-; GFX90A-NEXT: v_mov_b32_e32 v9, v5
; GFX90A-NEXT: v_mov_b32_e32 v10, v4
; GFX90A-NEXT: v_mov_b32_e32 v11, v5
+; GFX90A-NEXT: v_mov_b32_e32 v12, v4
+; GFX90A-NEXT: v_mov_b32_e32 v13, v5
; GFX90A-NEXT: v_mov_b32_e32 v0, v6
; GFX90A-NEXT: v_mov_b32_e32 v1, v7
-; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2670,19 +2686,19 @@ define void @v_shuffle_v4p0_v2p0__3_1_2_2(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v12, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[4:7]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v8, v4
-; GFX942-NEXT: v_mov_b32_e32 v9, v5
; GFX942-NEXT: v_mov_b32_e32 v10, v4
; GFX942-NEXT: v_mov_b32_e32 v11, v5
+; GFX942-NEXT: v_mov_b32_e32 v12, v4
+; GFX942-NEXT: v_mov_b32_e32 v13, v5
; GFX942-NEXT: v_mov_b32_e32 v0, v6
; GFX942-NEXT: v_mov_b32_e32 v1, v7
-; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr> asm "; def $0", "=v"()
@@ -2717,15 +2733,15 @@ define void @v_shuffle_v4p0_v2p0__3_3_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: v_mov_b32_e32 v5, v1
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: v_mov_b32_e32 v6, v0
; GFX90A-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: v_mov_b32_e32 v1, v3
-; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2735,15 +2751,15 @@ define void @v_shuffle_v4p0_v2p0__3_3_2_2(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: v_mov_b32_e32 v5, v1
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: v_mov_b32_e32 v6, v0
; GFX942-NEXT: v_mov_b32_e32 v7, v1
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: v_mov_b32_e32 v1, v3
-; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr> asm "; def $0", "=v"()
@@ -2776,13 +2792,13 @@ define void @v_shuffle_v4p0_v2p0__3_3_u_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: v_mov_b32_e32 v5, v1
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v1
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: v_mov_b32_e32 v1, v3
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v4, v[4:7], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2792,13 +2808,13 @@ define void @v_shuffle_v4p0_v2p0__3_3_u_2(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: v_mov_b32_e32 v5, v1
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v7, v1
; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: v_mov_b32_e32 v1, v3
-; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v4, v[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr> asm "; def $0", "=v"()
@@ -3134,33 +3150,33 @@ define void @v_shuffle_v4p0_v2p0__2_3_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p0_v2p0__2_3_3_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: v_mov_b32_e32 v6, v2
; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, v2
+; GFX90A-NEXT: v_mov_b32_e32 v9, v3
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p0_v2p0__2_3_3_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
; GFX942-NEXT: v_mov_b32_e32 v6, v2
; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v8, v2
+; GFX942-NEXT: v_mov_b32_e32 v9, v3
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr> asm "; def $0", "=v"()
@@ -3374,39 +3390,39 @@ define void @v_shuffle_v4p0_v2p0__3_2_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_2_3_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: v_mov_b32_e32 v6, v2
; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16
+; GFX90A-NEXT: v_mov_b32_e32 v8, v2
+; GFX90A-NEXT: v_mov_b32_e32 v9, v3
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p0_v2p0__3_2_3_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
; GFX942-NEXT: v_mov_b32_e32 v6, v2
; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX942-NEXT: v_mov_b32_e32 v8, v2
+; GFX942-NEXT: v_mov_b32_e32 v9, v3
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: v_mov_b32_e32 v5, v1
-; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v3p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v3p0.ll
index e6ac554735eee..09e497259766e 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v3p0.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v3p0.ll
@@ -1126,15 +1126,15 @@ define void @v_shuffle_v4p0_v3p0__5_5_5_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v10, 0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v4
+; GFX90A-NEXT: v_mov_b32_e32 v9, v5
+; GFX90A-NEXT: v_mov_b32_e32 v10, v0
+; GFX90A-NEXT: v_mov_b32_e32 v11, v1
; GFX90A-NEXT: v_mov_b32_e32 v2, v4
; GFX90A-NEXT: v_mov_b32_e32 v3, v5
-; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1144,15 +1144,15 @@ define void @v_shuffle_v4p0_v3p0__5_5_5_3(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v10, 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v7, v5
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, v4
+; GFX942-NEXT: v_mov_b32_e32 v9, v5
+; GFX942-NEXT: v_mov_b32_e32 v10, v0
+; GFX942-NEXT: v_mov_b32_e32 v11, v1
; GFX942-NEXT: v_mov_b32_e32 v2, v4
; GFX942-NEXT: v_mov_b32_e32 v3, v5
-; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr> asm "; def $0", "=v"()
@@ -1388,13 +1388,15 @@ define void @v_shuffle_v4p0_v3p0__1_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
+; GFX90A-NEXT: v_mov_b32_e32 v10, v0
+; GFX90A-NEXT: v_mov_b32_e32 v11, v1
; GFX90A-NEXT: v_mov_b32_e32 v4, v0
; GFX90A-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: v_mov_b32_e32 v7, v1
-; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1404,13 +1406,15 @@ define void @v_shuffle_v4p0_v3p0__1_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-NEXT: v_mov_b32_e32 v10, v0
+; GFX942-NEXT: v_mov_b32_e32 v11, v1
; GFX942-NEXT: v_mov_b32_e32 v4, v0
; GFX942-NEXT: v_mov_b32_e32 v5, v1
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr> asm "; def $0", "=v"()
@@ -3637,33 +3641,33 @@ define void @v_shuffle_v4p0_v3p0__1_2_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p0_v3p0__1_2_2_2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v10, 0
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
; GFX90A-NEXT: v_mov_b32_e32 v8, v4
; GFX90A-NEXT: v_mov_b32_e32 v9, v5
-; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v10, v4
+; GFX90A-NEXT: v_mov_b32_e32 v11, v5
+; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p0_v3p0__1_2_2_2:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v10, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:5]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v7, v5
; GFX942-NEXT: v_mov_b32_e32 v8, v4
; GFX942-NEXT: v_mov_b32_e32 v9, v5
-; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v10, v4
+; GFX942-NEXT: v_mov_b32_e32 v11, v5
+; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr> asm "; def $0", "=v"()
@@ -4787,13 +4791,15 @@ define void @v_shuffle_v4p0_v3p0__4_3_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
+; GFX90A-NEXT: v_mov_b32_e32 v10, v0
+; GFX90A-NEXT: v_mov_b32_e32 v11, v1
; GFX90A-NEXT: v_mov_b32_e32 v4, v0
; GFX90A-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: v_mov_b32_e32 v7, v1
-; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -4803,13 +4809,15 @@ define void @v_shuffle_v4p0_v3p0__4_3_3_3(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-NEXT: v_mov_b32_e32 v10, v0
+; GFX942-NEXT: v_mov_b32_e32 v11, v1
; GFX942-NEXT: v_mov_b32_e32 v4, v0
; GFX942-NEXT: v_mov_b32_e32 v5, v1
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr> asm "; def $0", "=v"()
@@ -5169,15 +5177,15 @@ define void @v_shuffle_v4p0_v3p0__5_4_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v10, 0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: v_mov_b32_e32 v8, v0
; GFX90A-NEXT: v_mov_b32_e32 v9, v1
+; GFX90A-NEXT: v_mov_b32_e32 v10, v0
+; GFX90A-NEXT: v_mov_b32_e32 v11, v1
; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: v_mov_b32_e32 v1, v5
-; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -5187,15 +5195,15 @@ define void @v_shuffle_v4p0_v3p0__5_4_3_3(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v10, 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: v_mov_b32_e32 v7, v1
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: v_mov_b32_e32 v8, v0
; GFX942-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-NEXT: v_mov_b32_e32 v10, v0
+; GFX942-NEXT: v_mov_b32_e32 v11, v1
; GFX942-NEXT: v_mov_b32_e32 v0, v4
; GFX942-NEXT: v_mov_b32_e32 v1, v5
-; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr> asm "; def $0", "=v"()
@@ -5532,15 +5540,15 @@ define void @v_shuffle_v4p0_v3p0__5_5_4_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v10, 0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v2
+; GFX90A-NEXT: v_mov_b32_e32 v9, v3
+; GFX90A-NEXT: v_mov_b32_e32 v10, v0
+; GFX90A-NEXT: v_mov_b32_e32 v11, v1
; GFX90A-NEXT: v_mov_b32_e32 v2, v4
; GFX90A-NEXT: v_mov_b32_e32 v3, v5
-; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -5550,15 +5558,15 @@ define void @v_shuffle_v4p0_v3p0__5_5_4_3(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v10, 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, v2
+; GFX942-NEXT: v_mov_b32_e32 v9, v3
+; GFX942-NEXT: v_mov_b32_e32 v10, v0
+; GFX942-NEXT: v_mov_b32_e32 v11, v1
; GFX942-NEXT: v_mov_b32_e32 v2, v4
; GFX942-NEXT: v_mov_b32_e32 v3, v5
-; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr> asm "; def $0", "=v"()
@@ -6255,17 +6263,17 @@ define void @v_shuffle_v4p0_v3p0__5_3_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v10, 0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: v_mov_b32_e32 v8, v2
; GFX90A-NEXT: v_mov_b32_e32 v9, v3
+; GFX90A-NEXT: v_mov_b32_e32 v10, v2
+; GFX90A-NEXT: v_mov_b32_e32 v11, v3
; GFX90A-NEXT: v_mov_b32_e32 v2, v4
; GFX90A-NEXT: v_mov_b32_e32 v3, v5
; GFX90A-NEXT: v_mov_b32_e32 v4, v0
; GFX90A-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -6275,17 +6283,17 @@ define void @v_shuffle_v4p0_v3p0__5_3_4_4(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v10, 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: v_mov_b32_e32 v8, v2
; GFX942-NEXT: v_mov_b32_e32 v9, v3
+; GFX942-NEXT: v_mov_b32_e32 v10, v2
+; GFX942-NEXT: v_mov_b32_e32 v11, v3
; GFX942-NEXT: v_mov_b32_e32 v2, v4
; GFX942-NEXT: v_mov_b32_e32 v3, v5
; GFX942-NEXT: v_mov_b32_e32 v4, v0
; GFX942-NEXT: v_mov_b32_e32 v5, v1
-; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr> asm "; def $0", "=v"()
@@ -6970,33 +6978,33 @@ define void @v_shuffle_v4p0_v3p0__4_5_5_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p0_v3p0__4_5_5_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v10, 0
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
; GFX90A-NEXT: v_mov_b32_e32 v8, v4
; GFX90A-NEXT: v_mov_b32_e32 v9, v5
-; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v10, v4
+; GFX90A-NEXT: v_mov_b32_e32 v11, v5
+; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p0_v3p0__4_5_5_5:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v10, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:5]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v7, v5
; GFX942-NEXT: v_mov_b32_e32 v8, v4
; GFX942-NEXT: v_mov_b32_e32 v9, v5
-; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v10, v4
+; GFX942-NEXT: v_mov_b32_e32 v11, v5
+; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr> asm "; def $0", "=v"()
@@ -7344,15 +7352,15 @@ define void @v_shuffle_v4p0_v3p0__5_4_5_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v10, 0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: v_mov_b32_e32 v8, v4
; GFX90A-NEXT: v_mov_b32_e32 v9, v5
+; GFX90A-NEXT: v_mov_b32_e32 v10, v4
+; GFX90A-NEXT: v_mov_b32_e32 v11, v5
; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: v_mov_b32_e32 v1, v5
-; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -7362,15 +7370,15 @@ define void @v_shuffle_v4p0_v3p0__5_4_5_5(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v10, 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v7, v5
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: v_mov_b32_e32 v8, v4
; GFX942-NEXT: v_mov_b32_e32 v9, v5
+; GFX942-NEXT: v_mov_b32_e32 v10, v4
+; GFX942-NEXT: v_mov_b32_e32 v11, v5
; GFX942-NEXT: v_mov_b32_e32 v0, v4
; GFX942-NEXT: v_mov_b32_e32 v1, v5
-; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v4p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v4p0.ll
index ce1c54129f706..257af574366a6 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v4p0.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v4p0.ll
@@ -8328,15 +8328,15 @@ define void @v_shuffle_v4p0_v4p0__7_5_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v12, 0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: v_mov_b32_e32 v10, v0
; GFX90A-NEXT: v_mov_b32_e32 v11, v1
+; GFX90A-NEXT: v_mov_b32_e32 v12, v0
+; GFX90A-NEXT: v_mov_b32_e32 v13, v1
; GFX90A-NEXT: v_mov_b32_e32 v0, v6
; GFX90A-NEXT: v_mov_b32_e32 v1, v7
-; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -8346,15 +8346,15 @@ define void @v_shuffle_v4p0_v4p0__7_5_4_4(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:7]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v12, 0
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: v_mov_b32_e32 v10, v0
; GFX942-NEXT: v_mov_b32_e32 v11, v1
+; GFX942-NEXT: v_mov_b32_e32 v12, v0
+; GFX942-NEXT: v_mov_b32_e32 v13, v1
; GFX942-NEXT: v_mov_b32_e32 v0, v6
; GFX942-NEXT: v_mov_b32_e32 v1, v7
-; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr> asm "; def $0", "=v"()
@@ -11254,15 +11254,15 @@ define void @v_shuffle_v4p0_v4p0__7_5_6_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v12, 0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v4
-; GFX90A-NEXT: v_mov_b32_e32 v9, v5
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: v_mov_b32_e32 v10, v4
; GFX90A-NEXT: v_mov_b32_e32 v11, v5
+; GFX90A-NEXT: v_mov_b32_e32 v12, v4
+; GFX90A-NEXT: v_mov_b32_e32 v13, v5
; GFX90A-NEXT: v_mov_b32_e32 v0, v6
; GFX90A-NEXT: v_mov_b32_e32 v1, v7
-; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -11272,15 +11272,15 @@ define void @v_shuffle_v4p0_v4p0__7_5_6_6(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:7]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v12, 0
-; GFX942-NEXT: v_mov_b32_e32 v8, v4
-; GFX942-NEXT: v_mov_b32_e32 v9, v5
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: v_mov_b32_e32 v10, v4
; GFX942-NEXT: v_mov_b32_e32 v11, v5
+; GFX942-NEXT: v_mov_b32_e32 v12, v4
+; GFX942-NEXT: v_mov_b32_e32 v13, v5
; GFX942-NEXT: v_mov_b32_e32 v0, v6
; GFX942-NEXT: v_mov_b32_e32 v1, v7
-; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v2p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v2p3.ll
index 3b5690562c38a..90a1b99dc7c14 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v2p3.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v2p3.ll
@@ -272,27 +272,27 @@ define void @v_shuffle_v4p3_v2p3__3_2_u_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v2p3__3_2_u_u:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v1
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: global_store_dwordx4 v2, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v2p3__3_2_u_u:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:3]
+; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v3
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v1
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: global_store_dwordx4 v2, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -2386,28 +2386,29 @@ define void @v_shuffle_v4p3_v2p3__3_3_u_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v2p3__3_3_u_2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
-; GFX90A-NEXT: v_mov_b32_e32 v1, v3
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
-; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v1
+; GFX90A-NEXT: v_mov_b32_e32 v5, v1
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: global_store_dwordx4 v2, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v2p3__3_3_u_2:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:3]
+; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v3
-; GFX942-NEXT: v_mov_b32_e32 v1, v3
-; GFX942-NEXT: v_mov_b32_e32 v3, v2
-; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v4, v1
+; GFX942-NEXT: v_mov_b32_e32 v5, v1
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: global_store_dwordx4 v2, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll
index 8039e126590b9..d13d26f638e0c 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll
@@ -255,15 +255,15 @@ define void @v_shuffle_v4p3_v3p3__5_0_u_u(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -271,15 +271,16 @@ define void @v_shuffle_v4p3_v3p3__5_0_u_u(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -413,27 +414,27 @@ define void @v_shuffle_v4p3_v3p3__5_3_u_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_3_u_u:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_3_u_u:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v4
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -553,15 +554,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_u(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -569,16 +571,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_u(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
-; GFX942-NEXT: v_mov_b32_e32 v1, v6
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -609,16 +612,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_u(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: v_mov_b32_e32 v4, v1
+; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -626,17 +629,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_u(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v2
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: v_mov_b32_e32 v4, v1
+; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -719,27 +722,29 @@ define void @v_shuffle_v4p3_v3p3__5_5_3_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_3_u:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
-; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_3_u:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v4
-; GFX942-NEXT: v_mov_b32_e32 v1, v4
-; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -766,28 +771,29 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_4_u:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v6, v1
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_4_u:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v4
-; GFX942-NEXT: v_mov_b32_e32 v1, v4
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: v_mov_b32_e32 v6, v1
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -1031,31 +1037,31 @@ define void @v_shuffle_v4p3_v3p3__5_5_5_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_5_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
-; GFX90A-NEXT: v_mov_b32_e32 v2, v6
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_5_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
-; GFX942-NEXT: v_mov_b32_e32 v1, v6
-; GFX942-NEXT: v_mov_b32_e32 v2, v6
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -1083,28 +1089,31 @@ define void @v_shuffle_v4p3_v3p3__5_5_5_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_5_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_5_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v4
-; GFX942-NEXT: v_mov_b32_e32 v1, v4
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
-; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v1
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -1180,28 +1189,29 @@ define void @v_shuffle_v4p3_v3p3__u_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v3p3__u_0_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_mov_b32_e32 v1, v0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v0
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v3p3__u_0_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-NEXT: v_mov_b32_e32 v1, v0
-; GFX942-NEXT: v_mov_b32_e32 v2, v0
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -1325,31 +1335,31 @@ define void @v_shuffle_v4p3_v3p3__2_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v3p3__2_0_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v3p3__2_0_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
-; GFX942-NEXT: v_mov_b32_e32 v1, v4
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -1375,28 +1385,29 @@ define void @v_shuffle_v4p3_v3p3__3_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v3p3__3_0_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_mov_b32_e32 v1, v0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v0
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v3p3__3_0_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-NEXT: v_mov_b32_e32 v1, v0
-; GFX942-NEXT: v_mov_b32_e32 v2, v0
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -1488,15 +1499,15 @@ define void @v_shuffle_v4p3_v3p3__5_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v0
+; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1506,15 +1517,15 @@ define void @v_shuffle_v4p3_v3p3__5_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v0, v2
-; GFX942-NEXT: v_mov_b32_e32 v1, v4
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v0
+; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -1545,34 +1556,33 @@ define void @v_shuffle_v4p3_v3p3__5_u_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_u_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v5, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_u_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v0, v2
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v5, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -1606,15 +1616,15 @@ define void @v_shuffle_v4p3_v3p3__5_1_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
-; GFX90A-NEXT: v_mov_b32_e32 v1, v5
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v0
+; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1624,15 +1634,15 @@ define void @v_shuffle_v4p3_v3p3__5_1_0_0(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v0, v2
-; GFX942-NEXT: v_mov_b32_e32 v1, v5
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v7, v1
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v0
+; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -1664,17 +1674,17 @@ define void @v_shuffle_v4p3_v3p3__5_2_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v6
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1682,17 +1692,17 @@ define void @v_shuffle_v4p3_v3p3__5_2_0_0(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v0, v2
-; GFX942-NEXT: v_mov_b32_e32 v1, v6
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v6
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -1724,17 +1734,17 @@ define void @v_shuffle_v4p3_v3p3__5_3_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v9, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[6:8]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v8
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v9, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v0
+; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1742,17 +1752,17 @@ define void @v_shuffle_v4p3_v3p3__5_3_0_0(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v9, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[6:8]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
-; GFX942-NEXT: v_mov_b32_e32 v0, v8
-; GFX942-NEXT: v_mov_b32_e32 v1, v6
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
-; GFX942-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v7, v2
+; GFX942-NEXT: v_mov_b32_e32 v9, v0
+; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -1843,15 +1853,15 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v4
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v0
+; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1861,15 +1871,15 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_0(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v0, v2
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v7, v4
+; GFX942-NEXT: v_mov_b32_e32 v9, v0
+; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -1901,16 +1911,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_u_0(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v4
+; GFX90A-NEXT: v_mov_b32_e32 v9, v0
+; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1918,16 +1928,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_u_0(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v3, v2
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
-; GFX942-NEXT: v_mov_b32_e32 v1, v6
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v9, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v7, v4
+; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -2018,17 +2028,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_2_0(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: v_mov_b32_e32 v2, v6
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v6
+; GFX90A-NEXT: v_mov_b32_e32 v5, v6
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2036,17 +2046,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_2_0(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v0, v2
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: v_mov_b32_e32 v2, v6
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v6
+; GFX942-NEXT: v_mov_b32_e32 v5, v6
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -2078,17 +2088,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_3_0(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v9, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[6:8]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v8
-; GFX90A-NEXT: v_mov_b32_e32 v1, v8
-; GFX90A-NEXT: v_mov_b32_e32 v2, v6
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v9, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v4
+; GFX90A-NEXT: v_mov_b32_e32 v8, v2
+; GFX90A-NEXT: v_mov_b32_e32 v9, v0
+; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2096,17 +2106,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_3_0(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v9, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[6:8]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
-; GFX942-NEXT: v_mov_b32_e32 v0, v8
-; GFX942-NEXT: v_mov_b32_e32 v1, v8
-; GFX942-NEXT: v_mov_b32_e32 v2, v6
-; GFX942-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v9, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v7, v4
+; GFX942-NEXT: v_mov_b32_e32 v8, v2
+; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -2603,16 +2613,17 @@ define void @v_shuffle_v4p3_v3p3__5_0_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v1
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
+; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2620,16 +2631,17 @@ define void @v_shuffle_v4p3_v3p3__5_0_1_1(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v8, v1
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -2661,16 +2673,17 @@ define void @v_shuffle_v4p3_v3p3__5_2_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v6
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v6, v1
+; GFX90A-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2678,17 +2691,17 @@ define void @v_shuffle_v4p3_v3p3__5_2_1_1(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v2
-; GFX942-NEXT: v_mov_b32_e32 v1, v4
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v6
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: v_mov_b32_e32 v6, v1
+; GFX942-NEXT: v_mov_b32_e32 v7, v1
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -2720,16 +2733,17 @@ define void @v_shuffle_v4p3_v3p3__5_3_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: v_mov_b32_e32 v8, v1
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
+; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2737,16 +2751,17 @@ define void @v_shuffle_v4p3_v3p3__5_3_1_1(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
-; GFX942-NEXT: v_mov_b32_e32 v1, v4
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v8, v1
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v7, v2
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -2834,16 +2849,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v4
+; GFX90A-NEXT: v_mov_b32_e32 v8, v1
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
+; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2851,17 +2867,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_1(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v2
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v8, v1
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v7, v4
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -2893,15 +2909,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_u_1(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v4
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
+; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2909,16 +2926,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_u_1(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v2
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v7, v4
+; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -2949,15 +2966,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_1(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v4
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
+; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2965,16 +2984,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_1(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
-; GFX942-NEXT: v_mov_b32_e32 v1, v6
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v7, v4
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -3006,16 +3026,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_2_1(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v6
+; GFX90A-NEXT: v_mov_b32_e32 v5, v6
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -3023,17 +3044,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_2_1(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v2
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
-; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v6
+; GFX942-NEXT: v_mov_b32_e32 v5, v6
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v1
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -3065,16 +3086,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_3_1(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v4
+; GFX90A-NEXT: v_mov_b32_e32 v8, v2
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
+; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -3082,17 +3104,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_3_1(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
-; GFX942-NEXT: v_mov_b32_e32 v1, v6
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v7, v4
+; GFX942-NEXT: v_mov_b32_e32 v8, v2
+; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -3124,16 +3146,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_1(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
-; GFX90A-NEXT: v_mov_b32_e32 v2, v5
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v4
+; GFX90A-NEXT: v_mov_b32_e32 v8, v3
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
+; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -3141,17 +3164,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_1(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
-; GFX942-NEXT: v_mov_b32_e32 v1, v6
-; GFX942-NEXT: v_mov_b32_e32 v2, v5
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v7, v4
+; GFX942-NEXT: v_mov_b32_e32 v8, v3
+; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -3584,17 +3607,17 @@ define void @v_shuffle_v4p3_v3p3__5_0_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[6:8]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v8
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v6
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -3602,17 +3625,17 @@ define void @v_shuffle_v4p3_v3p3__5_0_2_2(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[6:8]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: v_mov_b32_e32 v0, v8
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
-; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v6
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v2
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -3871,16 +3894,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_u_2(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v6
+; GFX90A-NEXT: v_mov_b32_e32 v5, v6
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -3888,16 +3911,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_u_2(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v3, v2
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
-; GFX942-NEXT: v_mov_b32_e32 v1, v6
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v6
+; GFX942-NEXT: v_mov_b32_e32 v5, v6
+; GFX942-NEXT: v_mov_b32_e32 v7, v2
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -3929,16 +3952,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_2(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[6:8]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v8
-; GFX90A-NEXT: v_mov_b32_e32 v1, v8
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v6
+; GFX90A-NEXT: v_mov_b32_e32 v5, v6
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -3946,16 +3970,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_2(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[6:8]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
-; GFX942-NEXT: v_mov_b32_e32 v0, v8
-; GFX942-NEXT: v_mov_b32_e32 v1, v8
-; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v6
+; GFX942-NEXT: v_mov_b32_e32 v5, v6
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v7, v2
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -4045,36 +4070,37 @@ define void @v_shuffle_v4p3_v3p3__5_5_3_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_3_2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[6:8]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v8
-; GFX90A-NEXT: v_mov_b32_e32 v1, v8
-; GFX90A-NEXT: v_mov_b32_e32 v2, v6
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, v6
+; GFX90A-NEXT: v_mov_b32_e32 v9, v6
+; GFX90A-NEXT: v_mov_b32_e32 v10, v4
+; GFX90A-NEXT: v_mov_b32_e32 v11, v2
+; GFX90A-NEXT: global_store_dwordx4 v3, v[8:11], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_3_2:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[6:8]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
-; GFX942-NEXT: v_mov_b32_e32 v0, v8
-; GFX942-NEXT: v_mov_b32_e32 v1, v8
-; GFX942-NEXT: v_mov_b32_e32 v2, v6
-; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v8, v6
+; GFX942-NEXT: v_mov_b32_e32 v9, v6
+; GFX942-NEXT: v_mov_b32_e32 v10, v4
+; GFX942-NEXT: v_mov_b32_e32 v11, v2
+; GFX942-NEXT: global_store_dwordx4 v3, v[8:11], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -4358,31 +4384,31 @@ define void @v_shuffle_v4p3_v3p3__5_3_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_3_3_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_3_3_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
-; GFX942-NEXT: v_mov_b32_e32 v1, v4
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -4410,29 +4436,27 @@ define void @v_shuffle_v4p3_v3p3__5_u_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_u_3_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v1, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v1, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: global_store_dwordx4 v3, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_u_3_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
-; GFX942-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: global_store_dwordx4 v3, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -4463,17 +4487,17 @@ define void @v_shuffle_v4p3_v3p3__5_0_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v2
+; GFX90A-NEXT: v_mov_b32_e32 v9, v2
+; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -4481,17 +4505,17 @@ define void @v_shuffle_v4p3_v3p3__5_0_3_3(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v8, v2
+; GFX942-NEXT: v_mov_b32_e32 v9, v2
+; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -4580,36 +4604,35 @@ define void @v_shuffle_v4p3_v3p3__5_2_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_2_3_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: v_mov_b32_e32 v8, v4
+; GFX90A-NEXT: v_mov_b32_e32 v9, v4
+; GFX90A-NEXT: global_store_dwordx4 v3, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_2_3_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v7, v2
+; GFX942-NEXT: v_mov_b32_e32 v8, v4
+; GFX942-NEXT: v_mov_b32_e32 v9, v4
+; GFX942-NEXT: global_store_dwordx4 v3, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -4637,31 +4660,31 @@ define void @v_shuffle_v4p3_v3p3__5_4_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_4_3_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v5
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v1
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_4_3_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
-; GFX942-NEXT: v_mov_b32_e32 v1, v5
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v1
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -4689,31 +4712,31 @@ define void @v_shuffle_v4p3_v3p3__5_5_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_3_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_3_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
-; GFX942-NEXT: v_mov_b32_e32 v1, v6
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -4741,28 +4764,29 @@ define void @v_shuffle_v4p3_v3p3__5_5_u_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_u_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
-; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_u_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v4
-; GFX942-NEXT: v_mov_b32_e32 v1, v4
-; GFX942-NEXT: v_mov_b32_e32 v3, v2
-; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -4793,16 +4817,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_3(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v4
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v2
+; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -4810,17 +4835,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_3(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
-; GFX942-NEXT: v_mov_b32_e32 v1, v6
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v7, v4
+; GFX942-NEXT: v_mov_b32_e32 v9, v2
+; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -5376,28 +5401,27 @@ define void @v_shuffle_v4p3_v3p3__5_u_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_u_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
-; GFX90A-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v1
+; GFX90A-NEXT: v_mov_b32_e32 v5, v1
+; GFX90A-NEXT: global_store_dwordx4 v3, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_u_4_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v2
-; GFX942-NEXT: v_mov_b32_e32 v2, v1
-; GFX942-NEXT: v_mov_b32_e32 v3, v1
-; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v4, v1
+; GFX942-NEXT: v_mov_b32_e32 v5, v1
+; GFX942-NEXT: global_store_dwordx4 v3, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -5428,17 +5452,17 @@ define void @v_shuffle_v4p3_v3p3__5_0_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: v_mov_b32_e32 v2, v5
-; GFX90A-NEXT: v_mov_b32_e32 v3, v5
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v3
+; GFX90A-NEXT: v_mov_b32_e32 v9, v3
+; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -5446,17 +5470,17 @@ define void @v_shuffle_v4p3_v3p3__5_0_4_4(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
-; GFX942-NEXT: v_mov_b32_e32 v2, v5
-; GFX942-NEXT: v_mov_b32_e32 v3, v5
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v8, v3
+; GFX942-NEXT: v_mov_b32_e32 v9, v3
+; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -5543,36 +5567,35 @@ define void @v_shuffle_v4p3_v3p3__5_2_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_2_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: v_mov_b32_e32 v2, v5
-; GFX90A-NEXT: v_mov_b32_e32 v3, v5
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: v_mov_b32_e32 v8, v5
+; GFX90A-NEXT: v_mov_b32_e32 v9, v5
+; GFX90A-NEXT: global_store_dwordx4 v3, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_2_4_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
-; GFX942-NEXT: v_mov_b32_e32 v2, v5
-; GFX942-NEXT: v_mov_b32_e32 v3, v5
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v7, v2
+; GFX942-NEXT: v_mov_b32_e32 v8, v5
+; GFX942-NEXT: v_mov_b32_e32 v9, v5
+; GFX942-NEXT: global_store_dwordx4 v3, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -5599,28 +5622,31 @@ define void @v_shuffle_v4p3_v3p3__5_3_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_3_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v1
+; GFX90A-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_3_4_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v4
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v1
+; GFX942-NEXT: v_mov_b32_e32 v7, v1
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -5648,28 +5674,31 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v6, v1
+; GFX90A-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_4_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v4
-; GFX942-NEXT: v_mov_b32_e32 v1, v4
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: v_mov_b32_e32 v6, v1
+; GFX942-NEXT: v_mov_b32_e32 v7, v1
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -5697,27 +5726,29 @@ define void @v_shuffle_v4p3_v3p3__5_5_u_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_u_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
-; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_u_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v4
-; GFX942-NEXT: v_mov_b32_e32 v1, v4
-; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v1
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -5749,16 +5780,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_4(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
-; GFX90A-NEXT: v_mov_b32_e32 v3, v5
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v4
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v3
+; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -5766,17 +5798,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_4(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
-; GFX942-NEXT: v_mov_b32_e32 v1, v6
-; GFX942-NEXT: v_mov_b32_e32 v3, v5
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v7, v4
+; GFX942-NEXT: v_mov_b32_e32 v9, v3
+; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -5808,17 +5840,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_4(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: v_mov_b32_e32 v3, v5
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v4
+; GFX90A-NEXT: v_mov_b32_e32 v8, v1
+; GFX90A-NEXT: v_mov_b32_e32 v9, v3
+; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -5826,17 +5858,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_4(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
-; GFX942-NEXT: v_mov_b32_e32 v1, v6
-; GFX942-NEXT: v_mov_b32_e32 v3, v5
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v8, v1
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v7, v4
+; GFX942-NEXT: v_mov_b32_e32 v9, v3
+; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -5922,27 +5954,31 @@ define void @v_shuffle_v4p3_v3p3__5_5_3_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_3_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
-; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_3_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v4
-; GFX942-NEXT: v_mov_b32_e32 v1, v4
-; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v7, v1
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -6508,30 +6544,31 @@ define void @v_shuffle_v4p3_v3p3__5_3_5_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_3_5_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_3_5_5:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v4
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
-; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v2
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -6604,28 +6641,29 @@ define void @v_shuffle_v4p3_v3p3__5_5_u_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_u_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
-; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_u_5:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v2
-; GFX942-NEXT: v_mov_b32_e32 v1, v2
-; GFX942-NEXT: v_mov_b32_e32 v3, v2
-; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v2
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -6657,16 +6695,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_5(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
-; GFX90A-NEXT: v_mov_b32_e32 v3, v6
-; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v4
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v4
+; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -6674,17 +6713,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_5(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
-; GFX942-NEXT: v_mov_b32_e32 v1, v6
-; GFX942-NEXT: v_mov_b32_e32 v3, v6
-; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v7, v4
+; GFX942-NEXT: v_mov_b32_e32 v9, v4
+; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -6830,28 +6869,31 @@ define void @v_shuffle_v4p3_v3p3__5_5_3_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_3_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_3_5:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
-; GFX942-NEXT: v_mov_b32_e32 v0, v4
-; GFX942-NEXT: v_mov_b32_e32 v1, v4
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
-; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v7, v2
+; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v4p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v4p3.ll
index eeab42ae40d7f..1684b94cfd452 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v4p3.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v4p3.ll
@@ -963,26 +963,29 @@ define void @v_shuffle_v4p3_v4p3__7_7_4_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_4_u:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_4_u:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v3
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -1009,26 +1012,29 @@ define void @v_shuffle_v4p3_v4p3__7_7_5_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_5_u:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: v_mov_b32_e32 v4, v1
-; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_mov_b32_e32 v8, v1
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_5_u:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: v_mov_b32_e32 v4, v1
-; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v3
+; GFX942-NEXT: v_mov_b32_e32 v8, v1
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -1440,31 +1446,31 @@ define void @v_shuffle_v4p3_v4p3__7_7_7_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_7_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v3
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v1
-; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_mov_b32_e32 v8, v3
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_7_5:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v3
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v7, v3
+; GFX942-NEXT: v_mov_b32_e32 v8, v3
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -1734,28 +1740,31 @@ define void @v_shuffle_v4p3_v4p3__2_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v4p3__2_0_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v0
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v4p3__2_0_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v0
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -2785,14 +2794,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_4_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v5
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, v5
+; GFX90A-NEXT: v_mov_b32_e32 v9, v5
+; GFX90A-NEXT: v_mov_b32_e32 v10, v2
+; GFX90A-NEXT: v_mov_b32_e32 v11, v0
+; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2802,14 +2812,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_4_0(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: v_mov_b32_e32 v4, v5
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v11, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v5
+; GFX942-NEXT: v_mov_b32_e32 v9, v5
+; GFX942-NEXT: v_mov_b32_e32 v10, v2
+; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -4138,14 +4149,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_4_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v5
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v1
-; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, v5
+; GFX90A-NEXT: v_mov_b32_e32 v9, v5
+; GFX90A-NEXT: v_mov_b32_e32 v10, v2
+; GFX90A-NEXT: v_mov_b32_e32 v11, v1
+; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -4155,14 +4167,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_4_1(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-NEXT: v_mov_b32_e32 v4, v5
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v11, v1
+; GFX942-NEXT: v_mov_b32_e32 v8, v5
+; GFX942-NEXT: v_mov_b32_e32 v9, v5
+; GFX942-NEXT: v_mov_b32_e32 v10, v2
+; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -4196,14 +4209,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_5_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v5
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v1
-; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, v5
+; GFX90A-NEXT: v_mov_b32_e32 v9, v5
+; GFX90A-NEXT: v_mov_b32_e32 v10, v3
+; GFX90A-NEXT: v_mov_b32_e32 v11, v1
+; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -4213,14 +4227,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_5_1(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-NEXT: v_mov_b32_e32 v4, v5
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v11, v1
+; GFX942-NEXT: v_mov_b32_e32 v8, v5
+; GFX942-NEXT: v_mov_b32_e32 v9, v5
+; GFX942-NEXT: v_mov_b32_e32 v10, v3
+; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -5448,34 +5463,37 @@ define void @v_shuffle_v4p3_v4p3__7_7_4_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_4_2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:7]
+; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v10, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:3]
+; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v7
-; GFX90A-NEXT: v_mov_b32_e32 v8, v4
-; GFX90A-NEXT: v_mov_b32_e32 v9, v2
-; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v10, v7
+; GFX90A-NEXT: v_mov_b32_e32 v11, v7
+; GFX90A-NEXT: v_mov_b32_e32 v12, v4
+; GFX90A-NEXT: v_mov_b32_e32 v13, v2
+; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_4_2:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:7]
+; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v10, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:3]
+; GFX942-NEXT: ; def v[4:7]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, v7
-; GFX942-NEXT: v_mov_b32_e32 v8, v4
-; GFX942-NEXT: v_mov_b32_e32 v9, v2
-; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v10, v7
+; GFX942-NEXT: v_mov_b32_e32 v11, v7
+; GFX942-NEXT: v_mov_b32_e32 v12, v4
+; GFX942-NEXT: v_mov_b32_e32 v13, v2
+; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -7172,28 +7190,31 @@ define void @v_shuffle_v4p3_v4p3__6_4_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v4p3__6_4_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v0
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v4p3__6_4_4_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v0
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -7271,28 +7292,29 @@ define void @v_shuffle_v4p3_v4p3__7_u_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_u_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v0
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_u_4_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v0
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -7559,30 +7581,31 @@ define void @v_shuffle_v4p3_v4p3__7_5_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_5_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v0
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_5_4_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: v_mov_b32_e32 v3, v1
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v1
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v0
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -7660,28 +7683,31 @@ define void @v_shuffle_v4p3_v4p3__7_7_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v0
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_4_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v3
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v0
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -7763,14 +7789,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_0_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v5
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: v_mov_b32_e32 v7, v2
-; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, v5
+; GFX90A-NEXT: v_mov_b32_e32 v9, v5
+; GFX90A-NEXT: v_mov_b32_e32 v10, v0
+; GFX90A-NEXT: v_mov_b32_e32 v11, v2
+; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -7780,14 +7807,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_0_4(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: v_mov_b32_e32 v4, v5
-; GFX942-NEXT: v_mov_b32_e32 v7, v2
-; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v10, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v5
+; GFX942-NEXT: v_mov_b32_e32 v9, v5
+; GFX942-NEXT: v_mov_b32_e32 v11, v2
+; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -8041,31 +8069,31 @@ define void @v_shuffle_v4p3_v4p3__7_7_6_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_6_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v3
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_mov_b32_e32 v8, v2
+; GFX90A-NEXT: v_mov_b32_e32 v9, v0
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_6_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v3
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v3
+; GFX942-NEXT: v_mov_b32_e32 v8, v2
+; GFX942-NEXT: v_mov_b32_e32 v9, v0
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -8946,28 +8974,31 @@ define void @v_shuffle_v4p3_v4p3__7_7_5_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_5_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: v_mov_b32_e32 v4, v1
-; GFX90A-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_mov_b32_e32 v8, v1
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_5_5:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: v_mov_b32_e32 v4, v1
-; GFX942-NEXT: v_mov_b32_e32 v5, v1
-; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v3
+; GFX942-NEXT: v_mov_b32_e32 v8, v1
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -9047,14 +9078,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_0_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v5
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, v5
+; GFX90A-NEXT: v_mov_b32_e32 v9, v5
+; GFX90A-NEXT: v_mov_b32_e32 v10, v0
+; GFX90A-NEXT: v_mov_b32_e32 v11, v3
+; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -9064,14 +9096,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_0_5(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: v_mov_b32_e32 v4, v5
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v10, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v5
+; GFX942-NEXT: v_mov_b32_e32 v9, v5
+; GFX942-NEXT: v_mov_b32_e32 v11, v3
+; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -9105,14 +9138,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_1_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v5
-; GFX90A-NEXT: v_mov_b32_e32 v6, v1
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, v5
+; GFX90A-NEXT: v_mov_b32_e32 v9, v5
+; GFX90A-NEXT: v_mov_b32_e32 v10, v1
+; GFX90A-NEXT: v_mov_b32_e32 v11, v3
+; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -9122,14 +9156,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_1_5(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, v1
-; GFX942-NEXT: v_mov_b32_e32 v4, v5
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v10, v1
+; GFX942-NEXT: v_mov_b32_e32 v8, v5
+; GFX942-NEXT: v_mov_b32_e32 v9, v5
+; GFX942-NEXT: v_mov_b32_e32 v11, v3
+; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -9277,28 +9312,31 @@ define void @v_shuffle_v4p3_v4p3__7_7_4_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_4_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_4_5:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: v_mov_b32_e32 v5, v1
-; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v3
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -9327,31 +9365,31 @@ define void @v_shuffle_v4p3_v4p3__7_7_6_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_6_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v3
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v1
-; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_mov_b32_e32 v8, v2
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_6_5:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v3
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v3
+; GFX942-NEXT: v_mov_b32_e32 v8, v2
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -10319,15 +10357,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_0_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v10, 0
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v5
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v4
-; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, v5
+; GFX90A-NEXT: v_mov_b32_e32 v9, v5
+; GFX90A-NEXT: v_mov_b32_e32 v10, v0
+; GFX90A-NEXT: v_mov_b32_e32 v11, v4
+; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -10337,15 +10375,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_0_6(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v10, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v5
-; GFX942-NEXT: v_mov_b32_e32 v7, v5
-; GFX942-NEXT: v_mov_b32_e32 v9, v4
-; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v10, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v5
+; GFX942-NEXT: v_mov_b32_e32 v9, v5
+; GFX942-NEXT: v_mov_b32_e32 v11, v4
+; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -10553,31 +10591,31 @@ define void @v_shuffle_v4p3_v4p3__7_7_4_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_4_6:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v3
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: v_mov_b32_e32 v7, v2
-; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v2
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_4_6:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v3
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: v_mov_b32_e32 v7, v2
-; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v3
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v2
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -11777,31 +11815,31 @@ define void @v_shuffle_v4p3_v4p3__7_7_4_7(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_4_7:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v3
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v3
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_4_7:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v3
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v3
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -11830,31 +11868,31 @@ define void @v_shuffle_v4p3_v4p3__7_7_5_7(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_5_7:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v3
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_mov_b32_e32 v6, v1
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, v1
+; GFX90A-NEXT: v_mov_b32_e32 v9, v3
+; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_5_7:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v3
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
-; GFX942-NEXT: v_mov_b32_e32 v6, v1
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v8, v1
+; GFX942-NEXT: v_mov_b32_e32 v9, v3
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll b/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll
index d2008be4fd32a..96b18593ea655 100644
--- a/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll
+++ b/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll
@@ -16,19 +16,18 @@ define amdgpu_kernel void @foo(ptr addrspace(5) %ptr5, ptr %p0, double %v0, <4 x
; CHECK-NEXT: s_add_u32 s0, s0, s17
; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: s_mov_b64 s[8:9], src_private_base
-; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5]
+; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_cmp_lg_u32 s68, -1
+; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5]
; CHECK-NEXT: s_mov_b32 s4, 0
; CHECK-NEXT: s_cselect_b32 s5, s9, 0
-; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7]
; CHECK-NEXT: s_cselect_b32 s6, s68, 0
+; CHECK-NEXT: s_add_u32 s50, s34, 48
; CHECK-NEXT: v_mov_b32_e32 v57, s5
; CHECK-NEXT: s_mov_b32 s5, s4
-; CHECK-NEXT: s_add_u32 s50, s34, 48
-; CHECK-NEXT: v_accvgpr_write_b32 a33, s5
; CHECK-NEXT: s_addc_u32 s51, s35, 0
-; CHECK-NEXT: v_accvgpr_write_b32 a32, s4
+; CHECK-NEXT: v_pk_mov_b32 v[62:63], s[4:5], s[4:5] op_sel:[0,1]
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, G@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, G@gotpcrel32@hi+12
@@ -48,13 +47,13 @@ define amdgpu_kernel void @foo(ptr addrspace(5) %ptr5, ptr %p0, double %v0, <4 x
; CHECK-NEXT: s_mov_b32 s52, s15
; CHECK-NEXT: s_mov_b64 s[36:37], s[10:11]
; CHECK-NEXT: v_mov_b32_e32 v40, v0
-; CHECK-NEXT: v_mov_b32_e32 v62, s66
-; CHECK-NEXT: v_mov_b32_e32 v63, s67
-; CHECK-NEXT: flat_store_dwordx2 v[58:59], a[32:33]
+; CHECK-NEXT: v_mov_b32_e32 v60, s66
+; CHECK-NEXT: v_mov_b32_e32 v61, s67
+; CHECK-NEXT: flat_store_dwordx2 v[58:59], v[62:63]
; CHECK-NEXT: ; kill: def $sgpr15 killed $sgpr15
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[54:55]
-; CHECK-NEXT: flat_load_dwordx2 v[60:61], v[58:59]
+; CHECK-NEXT: flat_load_dwordx2 a[32:33], v[58:59]
; CHECK-NEXT: v_mov_b32_e32 v44, 0
; CHECK-NEXT: v_mov_b32_e32 v45, 0x3ff00000
; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49]
@@ -66,7 +65,7 @@ define amdgpu_kernel void @foo(ptr addrspace(5) %ptr5, ptr %p0, double %v0, <4 x
; CHECK-NEXT: s_mov_b32 s14, s33
; CHECK-NEXT: v_mov_b32_e32 v31, v40
; CHECK-NEXT: flat_store_dwordx2 v[46:47], v[44:45]
-; CHECK-NEXT: flat_store_dwordx2 v[58:59], a[32:33]
+; CHECK-NEXT: flat_store_dwordx2 v[58:59], v[62:63]
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: ; kill: def $sgpr15 killed $sgpr15
; CHECK-NEXT: s_swappc_b64 s[30:31], s[54:55]
@@ -75,9 +74,9 @@ define amdgpu_kernel void @foo(ptr addrspace(5) %ptr5, ptr %p0, double %v0, <4 x
; CHECK-NEXT: v_mov_b32_e32 v1, s67
; CHECK-NEXT: v_mov_b32_e32 v0, s68
; CHECK-NEXT: v_cmp_lt_i32_e32 vcc, 0, v42
-; CHECK-NEXT: flat_store_dwordx2 v[58:59], v[60:61]
+; CHECK-NEXT: flat_store_dwordx2 v[58:59], a[32:33]
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx2 v[58:59], v[62:63]
+; CHECK-NEXT: flat_store_dwordx2 v[58:59], v[60:61]
; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT: buffer_store_dword v44, v0, s[0:3], 0 offen
; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc
diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
index b045c761436de..644705e173b52 100644
--- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
+++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
@@ -617,30 +617,30 @@ define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspac
; GFX942-LABEL: v8i8_multi_block:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX942-NEXT: v_and_b32_e32 v3, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v3
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
-; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v3
+; GFX942-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX942-NEXT: v_lshlrev_b32_e32 v2, 3, v1
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx2 v[0:1], v4, s[8:9]
+; GFX942-NEXT: global_load_dwordx2 v[6:7], v2, s[8:9]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[0:1]
+; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX942-NEXT: s_cbranch_execz .LBB11_4
; GFX942-NEXT: ; %bb.1: ; %bb.1
-; GFX942-NEXT: global_load_dwordx2 v[6:7], v4, s[10:11]
-; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v3
+; GFX942-NEXT: global_load_dwordx2 v[4:5], v2, s[10:11]
+; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v1
; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX942-NEXT: s_cbranch_execz .LBB11_3
; GFX942-NEXT: ; %bb.2: ; %bb.2
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
-; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[12:13]
+; GFX942-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-NEXT: global_store_dwordx2 v1, v[6:7], s[12:13]
; GFX942-NEXT: .LBB11_3: ; %Flow
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX942-NEXT: .LBB11_4: ; %bb.3
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[6:7], s[14:15]
+; GFX942-NEXT: global_store_dwordx2 v0, v[4:5], s[14:15]
; GFX942-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
>From f797b9bb2b74c6f090067b4495cd10813709adbe Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Wed, 8 Oct 2025 15:36:16 -0700
Subject: [PATCH 3/4] Revert "Update lit tests"
This reverts commit 6d5273761c2659ecaf8f453f8c9def032aed145e.
---
.../AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll | 144 +-
llvm/test/CodeGen/AMDGPU/a-v-ds-atomicrmw.ll | 406 +-
.../CodeGen/AMDGPU/a-v-flat-atomic-cmpxchg.ll | 81 +-
.../test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll | 3680 +++---
.../AMDGPU/a-v-global-atomic-cmpxchg.ll | 28 +-
.../CodeGen/AMDGPU/a-v-global-atomicrmw.ll | 1070 +-
.../AMDGPU/agpr-copy-no-free-registers.ll | 12 +-
llvm/test/CodeGen/AMDGPU/agpr-csr.ll | 680 +-
llvm/test/CodeGen/AMDGPU/agpr-remat.ll | 16 +-
.../buffer-fat-pointer-atomicrmw-fadd.ll | 42 +-
.../buffer-fat-pointer-atomicrmw-fmax.ll | 30 +-
.../buffer-fat-pointer-atomicrmw-fmin.ll | 30 +-
.../AMDGPU/buffer-fat-pointers-memcpy.ll | 307 +-
llvm/test/CodeGen/AMDGPU/ds_write2_a_v.ll | 422 +-
.../test/CodeGen/AMDGPU/flat-saddr-atomics.ll | 4 +
.../CodeGen/AMDGPU/global-i16-load-store.ll | 12 +-
.../AMDGPU/illegal-sgpr-to-vgpr-copy.ll | 7 +-
.../AMDGPU/lds-dma-workgroup-release.ll | 24 +-
.../AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll | 8 +-
.../CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll | 12 +-
.../llvm.amdgcn.image.atomic.dim.gfx90a.ll | 34 +-
.../CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll | 96 +-
.../CodeGen/AMDGPU/llvm.amdgcn.mfma.form.ll | 12 +-
.../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll | 80 +-
.../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll | 476 +-
.../AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll | 146 +-
.../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll | 1236 +-
.../CodeGen/AMDGPU/llvm.amdgcn.mfma.i8.ll | 41 +-
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll | 1420 +-
....amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll | 3668 ++----
...m.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll | 10777 ++++++----------
.../AMDGPU/llvm.amdgcn.smfmac.gfx950.ll | 2446 ++--
...gcn.struct.ptr.buffer.atomic.fadd_nortn.ll | 4 +
...mdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll | 16 +-
...uffer-fat-pointers-nontemporal-metadata.ll | 24 +-
llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll | 2000 +--
llvm/test/CodeGen/AMDGPU/mfma-loop.ll | 1321 +-
.../AMDGPU/mfma-no-register-aliasing.ll | 604 +-
.../CodeGen/AMDGPU/no-fold-accvgpr-mov.ll | 51 +-
.../CodeGen/AMDGPU/no-fold-accvgpr-mov.mir | 26 +-
.../CodeGen/AMDGPU/no-fold-accvgpr-read.mir | 4 +-
.../AMDGPU/preserve-wwm-copy-dst-reg.ll | 2 +-
.../AMDGPU/rewrite-vgpr-mfma-to-agpr-phi.ll | 2 +
.../AMDGPU/rewrite-vgpr-mfma-to-agpr.ll | 119 +-
.../AMDGPU/shufflevector-physreg-copy.ll | 12 +-
.../AMDGPU/shufflevector.v2f32.v3f32.ll | 28 +-
.../AMDGPU/shufflevector.v2i32.v3i32.ll | 28 +-
.../AMDGPU/shufflevector.v2i64.v2i64.ll | 40 +-
.../CodeGen/AMDGPU/shufflevector.v2p0.v2p0.ll | 40 +-
.../CodeGen/AMDGPU/shufflevector.v2p3.v3p3.ll | 28 +-
.../AMDGPU/shufflevector.v3f32.v2f32.ll | 391 +-
.../AMDGPU/shufflevector.v3f32.v3f32.ll | 200 +-
.../AMDGPU/shufflevector.v3f32.v4f32.ll | 377 +-
.../AMDGPU/shufflevector.v3i32.v2i32.ll | 391 +-
.../AMDGPU/shufflevector.v3i32.v3i32.ll | 200 +-
.../AMDGPU/shufflevector.v3i32.v4i32.ll | 377 +-
.../AMDGPU/shufflevector.v3i64.v2i64.ll | 92 +-
.../CodeGen/AMDGPU/shufflevector.v3p0.v2p0.ll | 92 +-
.../CodeGen/AMDGPU/shufflevector.v3p3.v2p3.ll | 391 +-
.../CodeGen/AMDGPU/shufflevector.v3p3.v3p3.ll | 200 +-
.../CodeGen/AMDGPU/shufflevector.v3p3.v4p3.ll | 377 +-
.../AMDGPU/shufflevector.v4f32.v2f32.ll | 45 +-
.../AMDGPU/shufflevector.v4f32.v3f32.ll | 1474 +--
.../AMDGPU/shufflevector.v4f32.v4f32.ll | 518 +-
.../AMDGPU/shufflevector.v4i32.v2i32.ll | 45 +-
.../AMDGPU/shufflevector.v4i32.v3i32.ll | 1474 +--
.../AMDGPU/shufflevector.v4i32.v4i32.ll | 518 +-
.../AMDGPU/shufflevector.v4i64.v2i64.ll | 364 +-
.../AMDGPU/shufflevector.v4i64.v3i64.ll | 204 +-
.../AMDGPU/shufflevector.v4i64.v4i64.ll | 40 +-
.../CodeGen/AMDGPU/shufflevector.v4p0.v2p0.ll | 364 +-
.../CodeGen/AMDGPU/shufflevector.v4p0.v3p0.ll | 204 +-
.../CodeGen/AMDGPU/shufflevector.v4p0.v4p0.ll | 40 +-
.../CodeGen/AMDGPU/shufflevector.v4p3.v2p3.ll | 45 +-
.../CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll | 1474 +--
.../CodeGen/AMDGPU/shufflevector.v4p3.v4p3.ll | 518 +-
.../AMDGPU/undef-handling-crash-in-ra.ll | 23 +-
.../test/CodeGen/AMDGPU/vni8-across-blocks.ll | 22 +-
78 files changed, 18502 insertions(+), 23754 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
index 9f1955c78eb36..7e297f46a780e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
@@ -23,9 +23,9 @@ define amdgpu_kernel void @raw_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, doub
; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v0, s8
-; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 offen
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v2, s8
+; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_buffer_atomic_add_noret_f64:
@@ -34,9 +34,9 @@ define amdgpu_kernel void @raw_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, doub
; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v0, s8
-; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 offen
+; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v2, s8
+; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen
; GFX942-NEXT: s_endpgm
;
; GFX1250-LABEL: raw_buffer_atomic_add_noret_f64:
@@ -142,9 +142,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_add_noret_f64(ptr addrspace(8)
; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v0, s8
-; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 offen
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v2, s8
+; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_ptr_buffer_atomic_add_noret_f64:
@@ -153,9 +153,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_add_noret_f64(ptr addrspace(8)
; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v0, s8
-; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 offen
+; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v2, s8
+; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen
; GFX942-NEXT: s_endpgm
;
; GFX1250-LABEL: raw_ptr_buffer_atomic_add_noret_f64:
@@ -261,9 +261,9 @@ define amdgpu_kernel void @struct_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, d
; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v0, s8
-; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v2, s8
+; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_buffer_atomic_add_noret_f64:
@@ -272,9 +272,9 @@ define amdgpu_kernel void @struct_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, d
; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v0, s8
-; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen
+; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v2, s8
+; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen
; GFX942-NEXT: s_endpgm
;
; GFX1250-LABEL: struct_buffer_atomic_add_noret_f64:
@@ -379,9 +379,9 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_add_noret_f64(ptr addrspace(
; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v0, s8
-; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v2, s8
+; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_ptr_buffer_atomic_add_noret_f64:
@@ -390,9 +390,9 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_add_noret_f64(ptr addrspace(
; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v0, s8
-; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen
+; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v2, s8
+; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen
; GFX942-NEXT: s_endpgm
;
; GFX1250-LABEL: struct_ptr_buffer_atomic_add_noret_f64:
@@ -497,9 +497,9 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, doub
; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v0, s8
-; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 offen
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v2, s8
+; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_buffer_atomic_min_noret_f64:
@@ -508,9 +508,9 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, doub
; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v0, s8
-; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 offen
+; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v2, s8
+; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen
; GFX942-NEXT: s_endpgm
;
; GFX1250-LABEL: raw_buffer_atomic_min_noret_f64:
@@ -616,9 +616,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8)
; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v0, s8
-; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 offen
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v2, s8
+; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_ptr_buffer_atomic_min_noret_f64:
@@ -627,9 +627,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8)
; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v0, s8
-; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 offen
+; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v2, s8
+; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen
; GFX942-NEXT: s_endpgm
;
; GFX1250-LABEL: raw_ptr_buffer_atomic_min_noret_f64:
@@ -735,9 +735,9 @@ define amdgpu_kernel void @struct_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, d
; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v0, s8
-; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v2, s8
+; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_buffer_atomic_min_noret_f64:
@@ -746,9 +746,9 @@ define amdgpu_kernel void @struct_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, d
; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v0, s8
-; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen
+; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v2, s8
+; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen
; GFX942-NEXT: s_endpgm
;
; GFX1250-LABEL: struct_buffer_atomic_min_noret_f64:
@@ -853,9 +853,9 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_min_noret_f64(ptr addrspace(
; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v0, s8
-; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v2, s8
+; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_ptr_buffer_atomic_min_noret_f64:
@@ -864,9 +864,9 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_min_noret_f64(ptr addrspace(
; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v0, s8
-; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen
+; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v2, s8
+; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen
; GFX942-NEXT: s_endpgm
;
; GFX1250-LABEL: struct_ptr_buffer_atomic_min_noret_f64:
@@ -971,9 +971,9 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> %rsrc, doub
; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v0, s8
-; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 offen
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v2, s8
+; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_buffer_atomic_max_noret_f64:
@@ -982,9 +982,9 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> %rsrc, doub
; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v0, s8
-; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 offen
+; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v2, s8
+; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen
; GFX942-NEXT: s_endpgm
;
; GFX1250-LABEL: raw_buffer_atomic_max_noret_f64:
@@ -1090,9 +1090,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8)
; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v0, s8
-; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 offen
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v2, s8
+; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_ptr_buffer_atomic_max_noret_f64:
@@ -1101,9 +1101,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8)
; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v0, s8
-; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 offen
+; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v2, s8
+; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen
; GFX942-NEXT: s_endpgm
;
; GFX1250-LABEL: raw_ptr_buffer_atomic_max_noret_f64:
@@ -1209,9 +1209,9 @@ define amdgpu_kernel void @struct_buffer_atomic_max_noret_f64(<4 x i32> %rsrc, d
; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v0, s8
-; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v2, s8
+; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_buffer_atomic_max_noret_f64:
@@ -1220,9 +1220,9 @@ define amdgpu_kernel void @struct_buffer_atomic_max_noret_f64(<4 x i32> %rsrc, d
; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v0, s8
-; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen
+; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v2, s8
+; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen
; GFX942-NEXT: s_endpgm
;
; GFX1250-LABEL: struct_buffer_atomic_max_noret_f64:
@@ -1327,9 +1327,9 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_max_noret_f64(ptr addrspace(
; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v0, s8
-; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v2, s8
+; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_ptr_buffer_atomic_max_noret_f64:
@@ -1338,9 +1338,9 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_max_noret_f64(ptr addrspace(
; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v0, s8
-; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen
+; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v2, s8
+; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen
; GFX942-NEXT: s_endpgm
;
; GFX1250-LABEL: struct_ptr_buffer_atomic_max_noret_f64:
diff --git a/llvm/test/CodeGen/AMDGPU/a-v-ds-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/a-v-ds-atomicrmw.ll
index 2968e0441d349..4c62409a85c00 100644
--- a/llvm/test/CodeGen/AMDGPU/a-v-ds-atomicrmw.ll
+++ b/llvm/test/CodeGen/AMDGPU/a-v-ds-atomicrmw.ll
@@ -183,125 +183,122 @@ define void @ds_atomic_xchg_i32_ret_av_av_no_agprs(ptr addrspace(3) %ptr) #0 {
; CHECK-LABEL: ds_atomic_xchg_i32_ret_av_av_no_agprs:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword a32, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword a33, off, s[0:3], s32 ; 4-byte Folded Spill
+; CHECK-NEXT: v_accvgpr_write_b32 a2, v40 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a3, v41 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a4, v42 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a5, v43 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a6, v44 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a7, v45 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a8, v46 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a9, v47 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a10, v56 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a11, v57 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a12, v58 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a13, v59 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a14, v60 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a15, v61 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a16, v62 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a17, v63 ; Reload Reuse
; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def v[0:31]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_accvgpr_write_b32 a33, v31
+; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def a1
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_accvgpr_write_b32 a32, v30
-; CHECK-NEXT: v_accvgpr_write_b32 a31, v29
-; CHECK-NEXT: v_accvgpr_write_b32 a30, v28
-; CHECK-NEXT: v_accvgpr_write_b32 a29, v27
-; CHECK-NEXT: v_accvgpr_write_b32 a28, v26
-; CHECK-NEXT: v_accvgpr_write_b32 a27, v25
-; CHECK-NEXT: v_accvgpr_write_b32 a26, v24
-; CHECK-NEXT: v_accvgpr_write_b32 a25, v23
-; CHECK-NEXT: v_accvgpr_write_b32 a24, v22
-; CHECK-NEXT: v_accvgpr_write_b32 a23, v21
-; CHECK-NEXT: v_accvgpr_write_b32 a22, v20
-; CHECK-NEXT: v_accvgpr_write_b32 a21, v19
-; CHECK-NEXT: v_accvgpr_write_b32 a20, v18
-; CHECK-NEXT: v_accvgpr_write_b32 a19, v17
-; CHECK-NEXT: v_accvgpr_write_b32 a18, v16
-; CHECK-NEXT: v_accvgpr_write_b32 a17, v15
-; CHECK-NEXT: v_accvgpr_write_b32 a16, v14
-; CHECK-NEXT: v_accvgpr_write_b32 a15, v13
-; CHECK-NEXT: v_accvgpr_write_b32 a14, v12
-; CHECK-NEXT: v_accvgpr_write_b32 a13, v11
-; CHECK-NEXT: v_accvgpr_write_b32 a12, v10
-; CHECK-NEXT: v_accvgpr_write_b32 a11, v9
-; CHECK-NEXT: v_accvgpr_write_b32 a10, v8
-; CHECK-NEXT: v_accvgpr_write_b32 a9, v7
-; CHECK-NEXT: v_accvgpr_write_b32 a8, v6
-; CHECK-NEXT: v_accvgpr_write_b32 a7, v5
-; CHECK-NEXT: v_accvgpr_write_b32 a6, v4
-; CHECK-NEXT: v_accvgpr_write_b32 a5, v3
-; CHECK-NEXT: v_accvgpr_write_b32 a4, v2
-; CHECK-NEXT: v_accvgpr_write_b32 a3, v1
-; CHECK-NEXT: v_accvgpr_write_b32 a2, v0
+; CHECK-NEXT: v_accvgpr_write_b32 a18, v31 ; Reload Reuse
; CHECK-NEXT: v_accvgpr_read_b32 v0, a0
; CHECK-NEXT: v_accvgpr_read_b32 v1, a1
; CHECK-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_accvgpr_write_b32 a31, v18 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a30, v19 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a29, v20 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a28, v21 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a27, v22 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a26, v23 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a25, v24 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a24, v25 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a23, v26 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a22, v27 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a21, v28 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a20, v29 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a19, v30 ; Reload Reuse
; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
-; CHECK-NEXT: v_accvgpr_read_b32 v0, a2
-; CHECK-NEXT: v_accvgpr_read_b32 v1, a3
-; CHECK-NEXT: v_accvgpr_read_b32 v2, a4
-; CHECK-NEXT: v_accvgpr_read_b32 v3, a5
-; CHECK-NEXT: v_accvgpr_read_b32 v4, a6
-; CHECK-NEXT: v_accvgpr_read_b32 v5, a7
-; CHECK-NEXT: v_accvgpr_read_b32 v6, a8
-; CHECK-NEXT: v_accvgpr_read_b32 v7, a9
-; CHECK-NEXT: v_accvgpr_read_b32 v8, a10
-; CHECK-NEXT: v_accvgpr_read_b32 v9, a11
-; CHECK-NEXT: v_accvgpr_read_b32 v10, a12
-; CHECK-NEXT: v_accvgpr_read_b32 v11, a13
-; CHECK-NEXT: v_accvgpr_read_b32 v12, a14
-; CHECK-NEXT: v_accvgpr_read_b32 v13, a15
-; CHECK-NEXT: v_accvgpr_read_b32 v14, a16
-; CHECK-NEXT: v_accvgpr_read_b32 v15, a17
-; CHECK-NEXT: v_accvgpr_read_b32 v16, a18
-; CHECK-NEXT: v_accvgpr_read_b32 v17, a19
-; CHECK-NEXT: v_accvgpr_read_b32 v18, a20
-; CHECK-NEXT: v_accvgpr_read_b32 v19, a21
-; CHECK-NEXT: v_accvgpr_read_b32 v20, a22
-; CHECK-NEXT: v_accvgpr_read_b32 v21, a23
-; CHECK-NEXT: v_accvgpr_read_b32 v22, a24
-; CHECK-NEXT: v_accvgpr_read_b32 v23, a25
-; CHECK-NEXT: v_accvgpr_read_b32 v24, a26
-; CHECK-NEXT: v_accvgpr_read_b32 v25, a27
-; CHECK-NEXT: v_accvgpr_read_b32 v26, a28
-; CHECK-NEXT: v_accvgpr_read_b32 v27, a29
-; CHECK-NEXT: v_accvgpr_read_b32 v28, a30
-; CHECK-NEXT: v_accvgpr_read_b32 v29, a31
-; CHECK-NEXT: v_accvgpr_read_b32 v30, a32
-; CHECK-NEXT: v_accvgpr_read_b32 v31, a33
-; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; use v[0:31]
-; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: buffer_load_dword a33, off, s[0:3], s32 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword a32, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; CHECK-NEXT: v_accvgpr_read_b32 v18, a31 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v19, a30 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v20, a29 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v21, a28 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v22, a27 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v23, a26 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v24, a25 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v25, a24 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v26, a23 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v27, a22 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v28, a21 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v29, a20 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v30, a19 ; Reload Reuse
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use a0
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_accvgpr_read_b32 v31, a18 ; Reload Reuse
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v[0:31]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v63, a17 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v62, a16 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v61, a15 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v60, a14 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v59, a13 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v58, a12 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v57, a11 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v56, a10 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v47, a9 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v46, a8 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v45, a7 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v44, a6 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v43, a5 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v42, a4 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v41, a3 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v40, a2 ; Reload Reuse
; CHECK-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(3) %ptr, i32 0, i32 10
%data = call i32 asm "; def $0", "=^VA"()
@@ -747,125 +744,122 @@ define void @ds_atomic_xor_i32_ret_av_av_no_agprs(ptr addrspace(3) %ptr) #0 {
; CHECK-LABEL: ds_atomic_xor_i32_ret_av_av_no_agprs:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword a32, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword a33, off, s[0:3], s32 ; 4-byte Folded Spill
+; CHECK-NEXT: v_accvgpr_write_b32 a2, v40 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a3, v41 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a4, v42 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a5, v43 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a6, v44 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a7, v45 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a8, v46 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a9, v47 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a10, v56 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a11, v57 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a12, v58 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a13, v59 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a14, v60 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a15, v61 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a16, v62 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a17, v63 ; Reload Reuse
; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def v[0:31]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_accvgpr_write_b32 a33, v31
+; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def a1
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_accvgpr_write_b32 a32, v30
-; CHECK-NEXT: v_accvgpr_write_b32 a31, v29
-; CHECK-NEXT: v_accvgpr_write_b32 a30, v28
-; CHECK-NEXT: v_accvgpr_write_b32 a29, v27
-; CHECK-NEXT: v_accvgpr_write_b32 a28, v26
-; CHECK-NEXT: v_accvgpr_write_b32 a27, v25
-; CHECK-NEXT: v_accvgpr_write_b32 a26, v24
-; CHECK-NEXT: v_accvgpr_write_b32 a25, v23
-; CHECK-NEXT: v_accvgpr_write_b32 a24, v22
-; CHECK-NEXT: v_accvgpr_write_b32 a23, v21
-; CHECK-NEXT: v_accvgpr_write_b32 a22, v20
-; CHECK-NEXT: v_accvgpr_write_b32 a21, v19
-; CHECK-NEXT: v_accvgpr_write_b32 a20, v18
-; CHECK-NEXT: v_accvgpr_write_b32 a19, v17
-; CHECK-NEXT: v_accvgpr_write_b32 a18, v16
-; CHECK-NEXT: v_accvgpr_write_b32 a17, v15
-; CHECK-NEXT: v_accvgpr_write_b32 a16, v14
-; CHECK-NEXT: v_accvgpr_write_b32 a15, v13
-; CHECK-NEXT: v_accvgpr_write_b32 a14, v12
-; CHECK-NEXT: v_accvgpr_write_b32 a13, v11
-; CHECK-NEXT: v_accvgpr_write_b32 a12, v10
-; CHECK-NEXT: v_accvgpr_write_b32 a11, v9
-; CHECK-NEXT: v_accvgpr_write_b32 a10, v8
-; CHECK-NEXT: v_accvgpr_write_b32 a9, v7
-; CHECK-NEXT: v_accvgpr_write_b32 a8, v6
-; CHECK-NEXT: v_accvgpr_write_b32 a7, v5
-; CHECK-NEXT: v_accvgpr_write_b32 a6, v4
-; CHECK-NEXT: v_accvgpr_write_b32 a5, v3
-; CHECK-NEXT: v_accvgpr_write_b32 a4, v2
-; CHECK-NEXT: v_accvgpr_write_b32 a3, v1
-; CHECK-NEXT: v_accvgpr_write_b32 a2, v0
+; CHECK-NEXT: v_accvgpr_write_b32 a18, v31 ; Reload Reuse
; CHECK-NEXT: v_accvgpr_read_b32 v0, a0
; CHECK-NEXT: v_accvgpr_read_b32 v1, a1
; CHECK-NEXT: ds_xor_rtn_b32 v0, v0, v1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_accvgpr_write_b32 a31, v18 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a30, v19 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a29, v20 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a28, v21 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a27, v22 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a26, v23 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a25, v24 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a24, v25 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a23, v26 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a22, v27 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a21, v28 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a20, v29 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a19, v30 ; Reload Reuse
; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
-; CHECK-NEXT: v_accvgpr_read_b32 v0, a2
-; CHECK-NEXT: v_accvgpr_read_b32 v1, a3
-; CHECK-NEXT: v_accvgpr_read_b32 v2, a4
-; CHECK-NEXT: v_accvgpr_read_b32 v3, a5
-; CHECK-NEXT: v_accvgpr_read_b32 v4, a6
-; CHECK-NEXT: v_accvgpr_read_b32 v5, a7
-; CHECK-NEXT: v_accvgpr_read_b32 v6, a8
-; CHECK-NEXT: v_accvgpr_read_b32 v7, a9
-; CHECK-NEXT: v_accvgpr_read_b32 v8, a10
-; CHECK-NEXT: v_accvgpr_read_b32 v9, a11
-; CHECK-NEXT: v_accvgpr_read_b32 v10, a12
-; CHECK-NEXT: v_accvgpr_read_b32 v11, a13
-; CHECK-NEXT: v_accvgpr_read_b32 v12, a14
-; CHECK-NEXT: v_accvgpr_read_b32 v13, a15
-; CHECK-NEXT: v_accvgpr_read_b32 v14, a16
-; CHECK-NEXT: v_accvgpr_read_b32 v15, a17
-; CHECK-NEXT: v_accvgpr_read_b32 v16, a18
-; CHECK-NEXT: v_accvgpr_read_b32 v17, a19
-; CHECK-NEXT: v_accvgpr_read_b32 v18, a20
-; CHECK-NEXT: v_accvgpr_read_b32 v19, a21
-; CHECK-NEXT: v_accvgpr_read_b32 v20, a22
-; CHECK-NEXT: v_accvgpr_read_b32 v21, a23
-; CHECK-NEXT: v_accvgpr_read_b32 v22, a24
-; CHECK-NEXT: v_accvgpr_read_b32 v23, a25
-; CHECK-NEXT: v_accvgpr_read_b32 v24, a26
-; CHECK-NEXT: v_accvgpr_read_b32 v25, a27
-; CHECK-NEXT: v_accvgpr_read_b32 v26, a28
-; CHECK-NEXT: v_accvgpr_read_b32 v27, a29
-; CHECK-NEXT: v_accvgpr_read_b32 v28, a30
-; CHECK-NEXT: v_accvgpr_read_b32 v29, a31
-; CHECK-NEXT: v_accvgpr_read_b32 v30, a32
-; CHECK-NEXT: v_accvgpr_read_b32 v31, a33
-; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; use v[0:31]
-; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: buffer_load_dword a33, off, s[0:3], s32 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword a32, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; CHECK-NEXT: v_accvgpr_read_b32 v18, a31 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v19, a30 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v20, a29 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v21, a28 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v22, a27 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v23, a26 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v24, a25 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v25, a24 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v26, a23 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v27, a22 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v28, a21 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v29, a20 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v30, a19 ; Reload Reuse
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use a0
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_accvgpr_read_b32 v31, a18 ; Reload Reuse
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v[0:31]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v63, a17 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v62, a16 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v61, a15 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v60, a14 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v59, a13 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v58, a12 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v57, a11 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v56, a10 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v47, a9 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v46, a8 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v45, a7 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v44, a6 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v43, a5 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v42, a4 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v41, a3 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v40, a2 ; Reload Reuse
; CHECK-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(3) %ptr, i32 0, i32 10
%data = call i32 asm "; def $0", "=^VA"()
diff --git a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomic-cmpxchg.ll b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomic-cmpxchg.ll
index e9192ca2d03ac..bc341f2baa804 100644
--- a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomic-cmpxchg.ll
+++ b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomic-cmpxchg.ll
@@ -472,46 +472,49 @@ define void @flat_atomic_cmpxchg_i64_ret_av_av__a(ptr %ptr) #0 {
; CHECK-LABEL: flat_atomic_cmpxchg_i64_ret_av_av__a:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0
+; CHECK-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0
; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base
-; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7
+; CHECK-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def v[2:3]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def v[0:1]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5
+; CHECK-NEXT: ; implicit-def: $agpr0_agpr1
; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc
; CHECK-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; CHECK-NEXT: s_cbranch_execz .LBB14_2
; CHECK-NEXT: ; %bb.1: ; %atomicrmw.global
; CHECK-NEXT: buffer_wbl2
-; CHECK-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[6:7], v[0:3] glc
+; CHECK-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: buffer_invl2
; CHECK-NEXT: buffer_wbinvl1_vol
-; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7
+; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5
+; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
+; CHECK-NEXT: v_accvgpr_write_b32 a1, v1
; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3
; CHECK-NEXT: .LBB14_2: ; %Flow
; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; CHECK-NEXT: s_cbranch_execz .LBB14_4
; CHECK-NEXT: ; %bb.3: ; %atomicrmw.private
-; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
-; CHECK-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc
+; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; CHECK-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc
; CHECK-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen
; CHECK-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: v_accvgpr_write_b32 a0, v4
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
; CHECK-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
-; CHECK-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen
+; CHECK-NEXT: v_accvgpr_write_b32 a1, v5
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
; CHECK-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen
; CHECK-NEXT: .LBB14_4: ; %atomicrmw.phi
; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
-; CHECK-NEXT: v_accvgpr_write_b32 a0, v4
-; CHECK-NEXT: v_accvgpr_write_b32 a1, v5
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use a[0:1]
; CHECK-NEXT: ;;#ASMEND
@@ -530,50 +533,53 @@ define void @flat_atomic_cmpxchg_i64_ret_a_a__a(ptr %ptr) #0 {
; CHECK-LABEL: flat_atomic_cmpxchg_i64_ret_a_a__a:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0
+; CHECK-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def a[0:1]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: v_accvgpr_read_b32 v3, a1
; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base
-; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
+; CHECK-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
; CHECK-NEXT: v_accvgpr_read_b32 v2, a0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def a[0:1]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: v_accvgpr_read_b32 v0, a0
; CHECK-NEXT: v_accvgpr_read_b32 v1, a1
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7
-; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
+; CHECK-NEXT: ; implicit-def: $agpr0_agpr1
; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc
; CHECK-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; CHECK-NEXT: s_cbranch_execz .LBB15_2
; CHECK-NEXT: ; %bb.1: ; %atomicrmw.global
; CHECK-NEXT: buffer_wbl2
-; CHECK-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[6:7], v[0:3] glc
+; CHECK-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: buffer_invl2
; CHECK-NEXT: buffer_wbinvl1_vol
-; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7
+; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5
+; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
+; CHECK-NEXT: v_accvgpr_write_b32 a1, v1
; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3
; CHECK-NEXT: .LBB15_2: ; %Flow
; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; CHECK-NEXT: s_cbranch_execz .LBB15_4
; CHECK-NEXT: ; %bb.3: ; %atomicrmw.private
-; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
-; CHECK-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc
+; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; CHECK-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc
; CHECK-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen
; CHECK-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: v_accvgpr_write_b32 a0, v4
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
; CHECK-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
-; CHECK-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen
+; CHECK-NEXT: v_accvgpr_write_b32 a1, v5
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
; CHECK-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen
; CHECK-NEXT: .LBB15_4: ; %atomicrmw.phi
; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
-; CHECK-NEXT: v_accvgpr_write_b32 a0, v4
-; CHECK-NEXT: v_accvgpr_write_b32 a1, v5
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use a[0:1]
; CHECK-NEXT: ;;#ASMEND
@@ -768,46 +774,49 @@ define void @flat_atomic_cmpxchg_i64_ret_v_v__a(ptr %ptr) #0 {
; CHECK-LABEL: flat_atomic_cmpxchg_i64_ret_v_v__a:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0
+; CHECK-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0
; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base
-; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7
+; CHECK-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def v[2:3]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def v[0:1]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5
+; CHECK-NEXT: ; implicit-def: $agpr0_agpr1
; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc
; CHECK-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; CHECK-NEXT: s_cbranch_execz .LBB19_2
; CHECK-NEXT: ; %bb.1: ; %atomicrmw.global
; CHECK-NEXT: buffer_wbl2
-; CHECK-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[6:7], v[0:3] glc
+; CHECK-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: buffer_invl2
; CHECK-NEXT: buffer_wbinvl1_vol
-; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7
+; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5
+; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
+; CHECK-NEXT: v_accvgpr_write_b32 a1, v1
; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3
; CHECK-NEXT: .LBB19_2: ; %Flow
; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; CHECK-NEXT: s_cbranch_execz .LBB19_4
; CHECK-NEXT: ; %bb.3: ; %atomicrmw.private
-; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
-; CHECK-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc
+; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; CHECK-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc
; CHECK-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen
; CHECK-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: v_accvgpr_write_b32 a0, v4
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
; CHECK-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
-; CHECK-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen
+; CHECK-NEXT: v_accvgpr_write_b32 a1, v5
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
; CHECK-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen
; CHECK-NEXT: .LBB19_4: ; %atomicrmw.phi
; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
-; CHECK-NEXT: v_accvgpr_write_b32 a0, v4
-; CHECK-NEXT: v_accvgpr_write_b32 a1, v5
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use a[0:1]
; CHECK-NEXT: ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll
index 4a8225fcd6ad2..d053425afbb6d 100644
--- a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll
+++ b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll
@@ -338,264 +338,225 @@ define void @flat_atomic_xchg_i32_ret_av_av_no_agprs(ptr %ptr) #0 {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword a32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword a33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword a34, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX90A-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a6, v43 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a7, v44 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a8, v45 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a9, v46 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a10, v47 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a11, v56 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a12, v57 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a13, v58 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a14, v59 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a15, v60 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:31]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_write_b32 a33, v31
-; GFX90A-NEXT: v_accvgpr_write_b32 a32, v30
-; GFX90A-NEXT: v_accvgpr_write_b32 a31, v29
-; GFX90A-NEXT: v_accvgpr_write_b32 a30, v28
-; GFX90A-NEXT: v_accvgpr_write_b32 a29, v27
-; GFX90A-NEXT: v_accvgpr_write_b32 a28, v26
-; GFX90A-NEXT: v_accvgpr_write_b32 a27, v25
-; GFX90A-NEXT: v_accvgpr_write_b32 a26, v24
-; GFX90A-NEXT: v_accvgpr_write_b32 a25, v23
-; GFX90A-NEXT: v_accvgpr_write_b32 a24, v22
-; GFX90A-NEXT: v_accvgpr_write_b32 a23, v21
-; GFX90A-NEXT: v_accvgpr_write_b32 a22, v20
-; GFX90A-NEXT: v_accvgpr_write_b32 a21, v19
-; GFX90A-NEXT: v_accvgpr_write_b32 a20, v18
-; GFX90A-NEXT: v_accvgpr_write_b32 a19, v17
-; GFX90A-NEXT: v_accvgpr_write_b32 a18, v16
-; GFX90A-NEXT: v_accvgpr_write_b32 a17, v15
-; GFX90A-NEXT: v_accvgpr_write_b32 a16, v14
-; GFX90A-NEXT: v_accvgpr_write_b32 a15, v13
-; GFX90A-NEXT: v_accvgpr_write_b32 a14, v12
-; GFX90A-NEXT: v_accvgpr_write_b32 a13, v11
-; GFX90A-NEXT: v_accvgpr_write_b32 a12, v10
-; GFX90A-NEXT: v_accvgpr_write_b32 a11, v9
-; GFX90A-NEXT: v_accvgpr_write_b32 a10, v8
-; GFX90A-NEXT: v_accvgpr_write_b32 a9, v7
-; GFX90A-NEXT: v_accvgpr_write_b32 a8, v6
-; GFX90A-NEXT: v_accvgpr_write_b32 a7, v5
-; GFX90A-NEXT: v_accvgpr_write_b32 a6, v4
-; GFX90A-NEXT: v_accvgpr_write_b32 a5, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a4, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a3, v1
-; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def a2
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_accvgpr_write_b32 a19, v31 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def a34
-; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
-; GFX90A-NEXT: v_accvgpr_read_b32 v2, a34
+; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
+; GFX90A-NEXT: v_accvgpr_write_b32 a31, v19 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a30, v20 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a29, v21 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a28, v22 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a27, v23 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a26, v24 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a25, v25 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a24, v26 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a23, v27 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_read_b32 v0, a2
-; GFX90A-NEXT: v_accvgpr_read_b32 v1, a3
-; GFX90A-NEXT: v_accvgpr_read_b32 v2, a4
-; GFX90A-NEXT: v_accvgpr_read_b32 v3, a5
-; GFX90A-NEXT: v_accvgpr_read_b32 v4, a6
-; GFX90A-NEXT: v_accvgpr_read_b32 v5, a7
-; GFX90A-NEXT: v_accvgpr_read_b32 v6, a8
-; GFX90A-NEXT: v_accvgpr_read_b32 v7, a9
-; GFX90A-NEXT: v_accvgpr_read_b32 v8, a10
-; GFX90A-NEXT: v_accvgpr_read_b32 v9, a11
-; GFX90A-NEXT: v_accvgpr_read_b32 v10, a12
-; GFX90A-NEXT: v_accvgpr_read_b32 v11, a13
-; GFX90A-NEXT: v_accvgpr_read_b32 v12, a14
-; GFX90A-NEXT: v_accvgpr_read_b32 v13, a15
-; GFX90A-NEXT: v_accvgpr_read_b32 v14, a16
-; GFX90A-NEXT: v_accvgpr_read_b32 v15, a17
-; GFX90A-NEXT: v_accvgpr_read_b32 v16, a18
-; GFX90A-NEXT: v_accvgpr_read_b32 v17, a19
-; GFX90A-NEXT: v_accvgpr_read_b32 v18, a20
-; GFX90A-NEXT: v_accvgpr_read_b32 v19, a21
-; GFX90A-NEXT: v_accvgpr_read_b32 v20, a22
-; GFX90A-NEXT: v_accvgpr_read_b32 v21, a23
-; GFX90A-NEXT: v_accvgpr_read_b32 v22, a24
-; GFX90A-NEXT: v_accvgpr_read_b32 v23, a25
-; GFX90A-NEXT: v_accvgpr_read_b32 v24, a26
-; GFX90A-NEXT: v_accvgpr_read_b32 v25, a27
-; GFX90A-NEXT: v_accvgpr_read_b32 v26, a28
-; GFX90A-NEXT: v_accvgpr_read_b32 v27, a29
-; GFX90A-NEXT: v_accvgpr_read_b32 v28, a30
-; GFX90A-NEXT: v_accvgpr_read_b32 v29, a31
-; GFX90A-NEXT: v_accvgpr_read_b32 v30, a32
-; GFX90A-NEXT: v_accvgpr_read_b32 v31, a33
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v[0:31]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: buffer_load_dword a34, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword a33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword a32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX90A-NEXT: v_accvgpr_read_b32 v19, a31 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v20, a30 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v21, a29 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v22, a28 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v23, a27 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v24, a26 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v25, a25 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v26, a24 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v27, a23 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v28, a22 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v29, a21 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v30, a20 ; Reload Reuse
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_accvgpr_read_b32 v31, a19 ; Reload Reuse
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use v[0:31]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_accvgpr_read_b32 v63, a18 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v62, a17 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v61, a16 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v60, a15 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v59, a14 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v58, a13 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v57, a12 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v56, a11 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v47, a10 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v46, a9 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v45, a8 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v44, a7 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v43, a6 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v42, a5 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v41, a4 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v40, a3 ; Reload Reuse
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xchg_i32_ret_av_av_no_agprs:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: scratch_store_dword off, v40, s32 offset:72 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v41, s32 offset:68 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v42, s32 offset:64 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v43, s32 offset:60 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v44, s32 offset:56 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v45, s32 offset:52 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v46, s32 offset:48 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v47, s32 offset:44 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v56, s32 offset:40 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v57, s32 offset:36 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v58, s32 offset:32 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v59, s32 offset:28 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v60, s32 offset:24 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v61, s32 offset:20 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v62, s32 offset:16 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v63, s32 offset:12 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, a32, s32 offset:8 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, a33, s32 offset:4 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, a34, s32 ; 4-byte Folded Spill
+; GFX950-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a6, v43 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a7, v44 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a8, v45 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a9, v46 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a10, v47 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a11, v56 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a12, v57 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a13, v58 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a14, v59 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a15, v60 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[0:31]
; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: scratch_store_dwordx4 off, v[0:3], s32 ; 16-byte Folded Spill
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: scratch_store_dwordx4 off, v[4:7], s32 offset:16 ; 16-byte Folded Spill
+; GFX950-NEXT: scratch_store_dwordx4 off, v[8:11], s32 offset:32 ; 16-byte Folded Spill
+; GFX950-NEXT: scratch_store_dwordx4 off, v[12:15], s32 offset:48 ; 16-byte Folded Spill
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def a34
+; GFX950-NEXT: ; def a2
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_accvgpr_write_b32 a33, v31
-; GFX950-NEXT: v_accvgpr_write_b32 a32, v30
-; GFX950-NEXT: v_accvgpr_write_b32 a31, v29
-; GFX950-NEXT: v_accvgpr_write_b32 a30, v28
-; GFX950-NEXT: v_accvgpr_write_b32 a29, v27
-; GFX950-NEXT: v_accvgpr_write_b32 a28, v26
-; GFX950-NEXT: v_accvgpr_write_b32 a27, v25
-; GFX950-NEXT: v_accvgpr_write_b32 a26, v24
-; GFX950-NEXT: v_accvgpr_write_b32 a25, v23
-; GFX950-NEXT: v_accvgpr_write_b32 a24, v22
-; GFX950-NEXT: v_accvgpr_write_b32 a23, v21
-; GFX950-NEXT: v_accvgpr_write_b32 a22, v20
-; GFX950-NEXT: v_accvgpr_write_b32 a21, v19
-; GFX950-NEXT: v_accvgpr_write_b32 a20, v18
-; GFX950-NEXT: v_accvgpr_write_b32 a19, v17
-; GFX950-NEXT: v_accvgpr_write_b32 a18, v16
-; GFX950-NEXT: v_accvgpr_write_b32 a17, v15
-; GFX950-NEXT: v_accvgpr_write_b32 a16, v14
-; GFX950-NEXT: v_accvgpr_write_b32 a15, v13
-; GFX950-NEXT: v_accvgpr_write_b32 a14, v12
-; GFX950-NEXT: v_accvgpr_write_b32 a13, v11
-; GFX950-NEXT: v_accvgpr_write_b32 a12, v10
-; GFX950-NEXT: v_accvgpr_write_b32 a11, v9
-; GFX950-NEXT: v_accvgpr_write_b32 a10, v8
-; GFX950-NEXT: v_accvgpr_write_b32 a9, v7
-; GFX950-NEXT: v_accvgpr_write_b32 a8, v6
-; GFX950-NEXT: v_accvgpr_write_b32 a7, v5
-; GFX950-NEXT: v_accvgpr_write_b32 a6, v4
-; GFX950-NEXT: v_accvgpr_write_b32 a5, v3
-; GFX950-NEXT: v_accvgpr_write_b32 a4, v2
-; GFX950-NEXT: v_accvgpr_write_b32 a3, v1
-; GFX950-NEXT: v_accvgpr_write_b32 a2, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a19, v31 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX950-NEXT: scratch_store_dwordx3 off, v[16:18], s32 offset:64 ; 12-byte Folded Spill
; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
-; GFX950-NEXT: v_accvgpr_read_b32 v2, a34
+; GFX950-NEXT: v_accvgpr_read_b32 v2, a2
; GFX950-NEXT: buffer_wbl2 sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
+; GFX950-NEXT: v_accvgpr_write_b32 a31, v19 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a27, v23 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a28, v22 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a29, v21 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a30, v20 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a23, v27 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a24, v26 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a25, v25 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a26, v24 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_read_b32 v0, a2
-; GFX950-NEXT: v_accvgpr_read_b32 v1, a3
-; GFX950-NEXT: v_accvgpr_read_b32 v2, a4
-; GFX950-NEXT: v_accvgpr_read_b32 v3, a5
-; GFX950-NEXT: v_accvgpr_read_b32 v4, a6
-; GFX950-NEXT: v_accvgpr_read_b32 v5, a7
-; GFX950-NEXT: v_accvgpr_read_b32 v6, a8
-; GFX950-NEXT: v_accvgpr_read_b32 v7, a9
-; GFX950-NEXT: v_accvgpr_read_b32 v8, a10
-; GFX950-NEXT: v_accvgpr_read_b32 v9, a11
-; GFX950-NEXT: v_accvgpr_read_b32 v10, a12
-; GFX950-NEXT: v_accvgpr_read_b32 v11, a13
-; GFX950-NEXT: v_accvgpr_read_b32 v12, a14
-; GFX950-NEXT: v_accvgpr_read_b32 v13, a15
-; GFX950-NEXT: v_accvgpr_read_b32 v14, a16
-; GFX950-NEXT: v_accvgpr_read_b32 v15, a17
-; GFX950-NEXT: v_accvgpr_read_b32 v16, a18
-; GFX950-NEXT: v_accvgpr_read_b32 v17, a19
-; GFX950-NEXT: v_accvgpr_read_b32 v18, a20
-; GFX950-NEXT: v_accvgpr_read_b32 v19, a21
-; GFX950-NEXT: v_accvgpr_read_b32 v20, a22
-; GFX950-NEXT: v_accvgpr_read_b32 v21, a23
-; GFX950-NEXT: v_accvgpr_read_b32 v22, a24
-; GFX950-NEXT: v_accvgpr_read_b32 v23, a25
-; GFX950-NEXT: v_accvgpr_read_b32 v24, a26
-; GFX950-NEXT: v_accvgpr_read_b32 v25, a27
-; GFX950-NEXT: v_accvgpr_read_b32 v26, a28
-; GFX950-NEXT: v_accvgpr_read_b32 v27, a29
-; GFX950-NEXT: v_accvgpr_read_b32 v28, a30
-; GFX950-NEXT: v_accvgpr_read_b32 v29, a31
-; GFX950-NEXT: v_accvgpr_read_b32 v30, a32
-; GFX950-NEXT: v_accvgpr_read_b32 v31, a33
-; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v[0:31]
-; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: scratch_load_dword a34, off, s32 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword a33, off, s32 offset:4 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword a32, off, s32 offset:8 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v63, off, s32 offset:12 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v62, off, s32 offset:16 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v61, off, s32 offset:20 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v60, off, s32 offset:24 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v59, off, s32 offset:28 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v58, off, s32 offset:32 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v57, off, s32 offset:36 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v56, off, s32 offset:40 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v47, off, s32 offset:44 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v46, off, s32 offset:48 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v45, off, s32 offset:52 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v44, off, s32 offset:56 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v43, off, s32 offset:60 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v42, off, s32 offset:64 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v41, off, s32 offset:68 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v40, off, s32 offset:72 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dwordx4 v[0:3], off, s32 ; 16-byte Folded Reload
+; GFX950-NEXT: scratch_load_dwordx4 v[4:7], off, s32 offset:16 ; 16-byte Folded Reload
+; GFX950-NEXT: scratch_load_dwordx4 v[8:11], off, s32 offset:32 ; 16-byte Folded Reload
+; GFX950-NEXT: scratch_load_dwordx4 v[12:15], off, s32 offset:48 ; 16-byte Folded Reload
+; GFX950-NEXT: v_accvgpr_read_b32 v19, a31 ; Reload Reuse
+; GFX950-NEXT: scratch_load_dwordx3 v[16:18], off, s32 offset:64 ; 12-byte Folded Reload
+; GFX950-NEXT: v_accvgpr_read_b32 v23, a27 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v22, a28 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v21, a29 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v20, a30 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v27, a23 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v26, a24 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v25, a25 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v24, a26 ; Reload Reuse
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_accvgpr_read_b32 v31, a19 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v30, a20 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v29, a21 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v28, a22 ; Reload Reuse
+; GFX950-NEXT: ;;#ASMSTART
+; GFX950-NEXT: ; use v[0:31]
+; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: v_accvgpr_read_b32 v63, a18 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v62, a17 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v61, a16 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v60, a15 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v59, a14 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v58, a13 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v57, a12 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v56, a11 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v47, a10 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v46, a9 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v45, a8 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v44, a7 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v43, a6 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v42, a5 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v41, a4 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v40, a3 ; Reload Reuse
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
%data = call i32 asm "; def $0", "=^VA"()
@@ -679,43 +640,43 @@ define void @flat_atomic_xchg_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xchg_i64_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
-; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1
-; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
-; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
-; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1
+; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
+; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB11_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] glc
+; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: .LBB11_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB11_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
-; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
-; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GFX90A-NEXT: buffer_load_dword a0, v0, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword a1, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB11_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(2)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -727,39 +688,39 @@ define void @flat_atomic_xchg_i64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3]
+; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def a[0:1]
+; GFX950-NEXT: ; def a[2:3]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
-; GFX950-NEXT: v_accvgpr_read_b32 v5, a1
-; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
-; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
+; GFX950-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX950-NEXT: v_accvgpr_read_b32 v3, a3
+; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB11_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: buffer_wbl2 sc0 sc1
-; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] sc0 sc1
+; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
-; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
+; GFX950-NEXT: ; implicit-def: $agpr2_agpr3
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB11_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB11_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
-; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v2, off
+; GFX950-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GFX950-NEXT: scratch_load_dwordx2 a[0:1], v0, off
; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: scratch_store_dwordx2 v2, a[0:1], off
+; GFX950-NEXT: scratch_store_dwordx2 v0, a[2:3], off
; GFX950-NEXT: .LBB11_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_waitcnt vmcnt(1)
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -871,41 +832,41 @@ define void @flat_atomic_xchg_i64_ret_v_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xchg_i64_ret_v_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
-; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
-; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:5]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB13_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] glc
+; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: .LBB13_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB13_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
-; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
-; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GFX90A-NEXT: buffer_load_dword a0, v0, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword a1, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB13_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(2)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -917,37 +878,37 @@ define void @flat_atomic_xchg_i64_ret_v_a(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3]
-; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
+; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
+; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v[4:5]
+; GFX950-NEXT: ; def v[2:3]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB13_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: buffer_wbl2 sc0 sc1
-; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] sc0 sc1
+; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB13_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB13_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
-; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v2, off
+; GFX950-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GFX950-NEXT: scratch_load_dwordx2 a[0:1], v0, off
; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: scratch_store_dwordx2 v2, v[4:5], off
+; GFX950-NEXT: scratch_store_dwordx2 v0, v[2:3], off
; GFX950-NEXT: .LBB13_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_waitcnt vmcnt(1)
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -965,40 +926,41 @@ define void @flat_atomic_xchg_i64_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xchg_i64_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
-; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
+; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB14_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: flat_atomic_swap_x2 v[2:3], v[4:5], v[0:1] glc
+; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
+; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: .LBB14_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB14_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
-; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
+; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB14_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(2)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v[2:3]
+; GFX90A-NEXT: ; use v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -1054,40 +1016,41 @@ define void @flat_atomic_xchg_i64_ret_av_v(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xchg_i64_ret_av_v:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
-; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
+; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB15_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: flat_atomic_swap_x2 v[2:3], v[4:5], v[0:1] glc
+; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
+; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: .LBB15_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB15_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
-; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
+; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB15_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(2)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v[2:3]
+; GFX90A-NEXT: ; use v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -1143,40 +1106,41 @@ define void @flat_atomic_xchg_i64_ret_av_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_xchg_i64_ret_av_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
-; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB16_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: flat_atomic_swap_x2 v[2:3], v[4:5], v[0:1] glc
+; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
-; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: .LBB16_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB16_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
-; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GFX90A-NEXT: buffer_load_dword a0, v0, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword a1, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB16_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(2)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -1188,37 +1152,37 @@ define void @flat_atomic_xchg_i64_ret_av_a(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3]
-; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
+; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
+; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v[4:5]
+; GFX950-NEXT: ; def v[2:3]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB16_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: buffer_wbl2 sc0 sc1
-; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] sc0 sc1
+; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB16_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB16_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
-; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v2, off
+; GFX950-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GFX950-NEXT: scratch_load_dwordx2 a[0:1], v0, off
; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: scratch_store_dwordx2 v2, v[4:5], off
+; GFX950-NEXT: scratch_store_dwordx2 v0, v[2:3], off
; GFX950-NEXT: .LBB16_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_waitcnt vmcnt(1)
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -1420,10 +1384,12 @@ define void @flat_atomic_xchg_i64_noret_a(ptr %ptr) #0 {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
-; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1
+; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB19_3
@@ -1440,13 +1406,14 @@ define void @flat_atomic_xchg_i64_noret_a(ptr %ptr) #0 {
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB19_2
; GFX90A-NEXT: .LBB19_4: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
-; GFX90A-NEXT: buffer_store_dword a1, v0, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: buffer_store_dword a0, v0, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -1516,6 +1483,7 @@ define void @flat_atomic_xchg_i64_noret_av(ptr %ptr) #0 {
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB20_2
; GFX90A-NEXT: .LBB20_4: ; %atomicrmw.private
@@ -1592,12 +1560,12 @@ define void @flat_atomic_xor_expansion_i32_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB21_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -1622,12 +1590,12 @@ define void @flat_atomic_xor_expansion_i32_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB21_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -1727,12 +1695,12 @@ define void @flat_atomic_xor_expansion_i32_ret_v_a(ptr %ptr) #0 {
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB23_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -1756,12 +1724,12 @@ define void @flat_atomic_xor_expansion_i32_ret_v_a(ptr %ptr) #0 {
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB23_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -1924,12 +1892,12 @@ define void @flat_atomic_xor_expansion_i32_ret_av_a(ptr %ptr) #0 {
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB26_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -1953,12 +1921,12 @@ define void @flat_atomic_xor_expansion_i32_ret_av_a(ptr %ptr) #0 {
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB26_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -2526,7 +2494,7 @@ define void @flat_atomic_xor_expansion_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: v_accvgpr_read_b32 v7, a1
; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
-; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB32_4
@@ -2544,7 +2512,9 @@ define void @flat_atomic_xor_expansion_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB32_2
@@ -2558,18 +2528,18 @@ define void @flat_atomic_xor_expansion_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
-; GFX90A-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(1)
-; GFX90A-NEXT: v_xor_b32_e32 v1, v3, v7
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_xor_b32_e32 v4, v2, v6
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_xor_b32_e32 v3, v1, v7
; GFX90A-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB32_6: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -2586,7 +2556,7 @@ define void @flat_atomic_xor_expansion_i64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
; GFX950-NEXT: v_accvgpr_read_b32 v7, a1
; GFX950-NEXT: v_accvgpr_read_b32 v6, a0
-; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB32_4
@@ -2603,7 +2573,9 @@ define void @flat_atomic_xor_expansion_i64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB32_2
@@ -2618,15 +2590,15 @@ define void @flat_atomic_xor_expansion_i64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
-; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off
+; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_xor_b32_e32 v1, v3, v7
-; GFX950-NEXT: v_xor_b32_e32 v0, v2, v6
-; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_xor_b32_e32 v3, v1, v7
+; GFX950-NEXT: v_xor_b32_e32 v2, v0, v6
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
; GFX950-NEXT: .LBB32_6: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -2770,7 +2742,7 @@ define void @flat_atomic_xor_expansion_i64_ret_v_a(ptr %ptr) #0 {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[6:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB34_4
@@ -2788,7 +2760,9 @@ define void @flat_atomic_xor_expansion_i64_ret_v_a(ptr %ptr) #0 {
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB34_2
@@ -2802,18 +2776,18 @@ define void @flat_atomic_xor_expansion_i64_ret_v_a(ptr %ptr) #0 {
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
-; GFX90A-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(1)
-; GFX90A-NEXT: v_xor_b32_e32 v1, v3, v7
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_xor_b32_e32 v4, v2, v6
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_xor_b32_e32 v3, v1, v7
; GFX90A-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB34_6: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -2828,7 +2802,7 @@ define void @flat_atomic_xor_expansion_i64_ret_v_a(ptr %ptr) #0 {
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[6:7]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB34_4
@@ -2845,7 +2819,9 @@ define void @flat_atomic_xor_expansion_i64_ret_v_a(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB34_2
@@ -2860,15 +2836,15 @@ define void @flat_atomic_xor_expansion_i64_ret_v_a(ptr %ptr) #0 {
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
-; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off
+; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_xor_b32_e32 v1, v3, v7
-; GFX950-NEXT: v_xor_b32_e32 v0, v2, v6
-; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_xor_b32_e32 v3, v1, v7
+; GFX950-NEXT: v_xor_b32_e32 v2, v0, v6
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
; GFX950-NEXT: .LBB34_6: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -3125,7 +3101,7 @@ define void @flat_atomic_xor_expansion_i64_ret_av_a(ptr %ptr) #0 {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[6:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB37_4
@@ -3143,7 +3119,9 @@ define void @flat_atomic_xor_expansion_i64_ret_av_a(ptr %ptr) #0 {
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB37_2
@@ -3157,18 +3135,18 @@ define void @flat_atomic_xor_expansion_i64_ret_av_a(ptr %ptr) #0 {
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
-; GFX90A-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(1)
-; GFX90A-NEXT: v_xor_b32_e32 v1, v3, v7
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_xor_b32_e32 v4, v2, v6
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_xor_b32_e32 v3, v1, v7
; GFX90A-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB37_6: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -3183,7 +3161,7 @@ define void @flat_atomic_xor_expansion_i64_ret_av_a(ptr %ptr) #0 {
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[6:7]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB37_4
@@ -3200,7 +3178,9 @@ define void @flat_atomic_xor_expansion_i64_ret_av_a(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB37_2
@@ -3215,15 +3195,15 @@ define void @flat_atomic_xor_expansion_i64_ret_av_a(ptr %ptr) #0 {
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
-; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off
+; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_xor_b32_e32 v1, v3, v7
-; GFX950-NEXT: v_xor_b32_e32 v0, v2, v6
-; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_xor_b32_e32 v3, v1, v7
+; GFX950-NEXT: v_xor_b32_e32 v2, v0, v6
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
; GFX950-NEXT: .LBB37_6: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -4028,262 +4008,223 @@ define void @flat_atomic_xor_i32_ret_av_av_no_agprs(ptr %ptr) #0 {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword a32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword a33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword a34, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX90A-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a6, v43 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a7, v44 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a8, v45 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a9, v46 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a10, v47 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a11, v56 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a12, v57 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a13, v58 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a14, v59 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a15, v60 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:31]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_write_b32 a33, v31
-; GFX90A-NEXT: v_accvgpr_write_b32 a32, v30
-; GFX90A-NEXT: v_accvgpr_write_b32 a31, v29
-; GFX90A-NEXT: v_accvgpr_write_b32 a30, v28
-; GFX90A-NEXT: v_accvgpr_write_b32 a29, v27
-; GFX90A-NEXT: v_accvgpr_write_b32 a28, v26
-; GFX90A-NEXT: v_accvgpr_write_b32 a27, v25
-; GFX90A-NEXT: v_accvgpr_write_b32 a26, v24
-; GFX90A-NEXT: v_accvgpr_write_b32 a25, v23
-; GFX90A-NEXT: v_accvgpr_write_b32 a24, v22
-; GFX90A-NEXT: v_accvgpr_write_b32 a23, v21
-; GFX90A-NEXT: v_accvgpr_write_b32 a22, v20
-; GFX90A-NEXT: v_accvgpr_write_b32 a21, v19
-; GFX90A-NEXT: v_accvgpr_write_b32 a20, v18
-; GFX90A-NEXT: v_accvgpr_write_b32 a19, v17
-; GFX90A-NEXT: v_accvgpr_write_b32 a18, v16
-; GFX90A-NEXT: v_accvgpr_write_b32 a17, v15
-; GFX90A-NEXT: v_accvgpr_write_b32 a16, v14
-; GFX90A-NEXT: v_accvgpr_write_b32 a15, v13
-; GFX90A-NEXT: v_accvgpr_write_b32 a14, v12
-; GFX90A-NEXT: v_accvgpr_write_b32 a13, v11
-; GFX90A-NEXT: v_accvgpr_write_b32 a12, v10
-; GFX90A-NEXT: v_accvgpr_write_b32 a11, v9
-; GFX90A-NEXT: v_accvgpr_write_b32 a10, v8
-; GFX90A-NEXT: v_accvgpr_write_b32 a9, v7
-; GFX90A-NEXT: v_accvgpr_write_b32 a8, v6
-; GFX90A-NEXT: v_accvgpr_write_b32 a7, v5
-; GFX90A-NEXT: v_accvgpr_write_b32 a6, v4
-; GFX90A-NEXT: v_accvgpr_write_b32 a5, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a4, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a3, v1
-; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def a2
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_accvgpr_write_b32 a19, v31 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def a34
-; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
-; GFX90A-NEXT: v_accvgpr_read_b32 v2, a34
+; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: flat_atomic_xor v0, v[0:1], v2 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
+; GFX90A-NEXT: v_accvgpr_write_b32 a31, v19 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a30, v20 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a29, v21 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a28, v22 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a27, v23 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a26, v24 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a25, v25 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a24, v26 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a23, v27 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_read_b32 v0, a2
-; GFX90A-NEXT: v_accvgpr_read_b32 v1, a3
-; GFX90A-NEXT: v_accvgpr_read_b32 v2, a4
-; GFX90A-NEXT: v_accvgpr_read_b32 v3, a5
-; GFX90A-NEXT: v_accvgpr_read_b32 v4, a6
-; GFX90A-NEXT: v_accvgpr_read_b32 v5, a7
-; GFX90A-NEXT: v_accvgpr_read_b32 v6, a8
-; GFX90A-NEXT: v_accvgpr_read_b32 v7, a9
-; GFX90A-NEXT: v_accvgpr_read_b32 v8, a10
-; GFX90A-NEXT: v_accvgpr_read_b32 v9, a11
-; GFX90A-NEXT: v_accvgpr_read_b32 v10, a12
-; GFX90A-NEXT: v_accvgpr_read_b32 v11, a13
-; GFX90A-NEXT: v_accvgpr_read_b32 v12, a14
-; GFX90A-NEXT: v_accvgpr_read_b32 v13, a15
-; GFX90A-NEXT: v_accvgpr_read_b32 v14, a16
-; GFX90A-NEXT: v_accvgpr_read_b32 v15, a17
-; GFX90A-NEXT: v_accvgpr_read_b32 v16, a18
-; GFX90A-NEXT: v_accvgpr_read_b32 v17, a19
-; GFX90A-NEXT: v_accvgpr_read_b32 v18, a20
-; GFX90A-NEXT: v_accvgpr_read_b32 v19, a21
-; GFX90A-NEXT: v_accvgpr_read_b32 v20, a22
-; GFX90A-NEXT: v_accvgpr_read_b32 v21, a23
-; GFX90A-NEXT: v_accvgpr_read_b32 v22, a24
-; GFX90A-NEXT: v_accvgpr_read_b32 v23, a25
-; GFX90A-NEXT: v_accvgpr_read_b32 v24, a26
-; GFX90A-NEXT: v_accvgpr_read_b32 v25, a27
-; GFX90A-NEXT: v_accvgpr_read_b32 v26, a28
-; GFX90A-NEXT: v_accvgpr_read_b32 v27, a29
-; GFX90A-NEXT: v_accvgpr_read_b32 v28, a30
-; GFX90A-NEXT: v_accvgpr_read_b32 v29, a31
-; GFX90A-NEXT: v_accvgpr_read_b32 v30, a32
-; GFX90A-NEXT: v_accvgpr_read_b32 v31, a33
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v[0:31]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: buffer_load_dword a34, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword a33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword a32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX90A-NEXT: v_accvgpr_read_b32 v19, a31 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v20, a30 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v21, a29 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v22, a28 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v23, a27 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v24, a26 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v25, a25 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v26, a24 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v27, a23 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v28, a22 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v29, a21 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v30, a20 ; Reload Reuse
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_accvgpr_read_b32 v31, a19 ; Reload Reuse
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use v[0:31]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_accvgpr_read_b32 v63, a18 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v62, a17 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v61, a16 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v60, a15 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v59, a14 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v58, a13 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v57, a12 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v56, a11 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v47, a10 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v46, a9 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v45, a8 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v44, a7 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v43, a6 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v42, a5 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v41, a4 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v40, a3 ; Reload Reuse
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xor_i32_ret_av_av_no_agprs:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: scratch_store_dword off, v40, s32 offset:72 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v41, s32 offset:68 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v42, s32 offset:64 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v43, s32 offset:60 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v44, s32 offset:56 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v45, s32 offset:52 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v46, s32 offset:48 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v47, s32 offset:44 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v56, s32 offset:40 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v57, s32 offset:36 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v58, s32 offset:32 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v59, s32 offset:28 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v60, s32 offset:24 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v61, s32 offset:20 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v62, s32 offset:16 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v63, s32 offset:12 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, a32, s32 offset:8 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, a33, s32 offset:4 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, a34, s32 ; 4-byte Folded Spill
+; GFX950-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a6, v43 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a7, v44 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a8, v45 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a9, v46 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a10, v47 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a11, v56 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a12, v57 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a13, v58 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a14, v59 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a15, v60 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[0:31]
; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: scratch_store_dwordx4 off, v[0:3], s32 ; 16-byte Folded Spill
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: scratch_store_dwordx4 off, v[4:7], s32 offset:16 ; 16-byte Folded Spill
+; GFX950-NEXT: scratch_store_dwordx4 off, v[8:11], s32 offset:32 ; 16-byte Folded Spill
+; GFX950-NEXT: scratch_store_dwordx4 off, v[12:15], s32 offset:48 ; 16-byte Folded Spill
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def a34
+; GFX950-NEXT: ; def a2
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_accvgpr_write_b32 a33, v31
-; GFX950-NEXT: v_accvgpr_write_b32 a32, v30
-; GFX950-NEXT: v_accvgpr_write_b32 a31, v29
-; GFX950-NEXT: v_accvgpr_write_b32 a30, v28
-; GFX950-NEXT: v_accvgpr_write_b32 a29, v27
-; GFX950-NEXT: v_accvgpr_write_b32 a28, v26
-; GFX950-NEXT: v_accvgpr_write_b32 a27, v25
-; GFX950-NEXT: v_accvgpr_write_b32 a26, v24
-; GFX950-NEXT: v_accvgpr_write_b32 a25, v23
-; GFX950-NEXT: v_accvgpr_write_b32 a24, v22
-; GFX950-NEXT: v_accvgpr_write_b32 a23, v21
-; GFX950-NEXT: v_accvgpr_write_b32 a22, v20
-; GFX950-NEXT: v_accvgpr_write_b32 a21, v19
-; GFX950-NEXT: v_accvgpr_write_b32 a20, v18
-; GFX950-NEXT: v_accvgpr_write_b32 a19, v17
-; GFX950-NEXT: v_accvgpr_write_b32 a18, v16
-; GFX950-NEXT: v_accvgpr_write_b32 a17, v15
-; GFX950-NEXT: v_accvgpr_write_b32 a16, v14
-; GFX950-NEXT: v_accvgpr_write_b32 a15, v13
-; GFX950-NEXT: v_accvgpr_write_b32 a14, v12
-; GFX950-NEXT: v_accvgpr_write_b32 a13, v11
-; GFX950-NEXT: v_accvgpr_write_b32 a12, v10
-; GFX950-NEXT: v_accvgpr_write_b32 a11, v9
-; GFX950-NEXT: v_accvgpr_write_b32 a10, v8
-; GFX950-NEXT: v_accvgpr_write_b32 a9, v7
-; GFX950-NEXT: v_accvgpr_write_b32 a8, v6
-; GFX950-NEXT: v_accvgpr_write_b32 a7, v5
-; GFX950-NEXT: v_accvgpr_write_b32 a6, v4
-; GFX950-NEXT: v_accvgpr_write_b32 a5, v3
-; GFX950-NEXT: v_accvgpr_write_b32 a4, v2
-; GFX950-NEXT: v_accvgpr_write_b32 a3, v1
-; GFX950-NEXT: v_accvgpr_write_b32 a2, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a19, v31 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX950-NEXT: scratch_store_dwordx3 off, v[16:18], s32 offset:64 ; 12-byte Folded Spill
; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
-; GFX950-NEXT: v_accvgpr_read_b32 v2, a34
+; GFX950-NEXT: v_accvgpr_read_b32 v2, a2
; GFX950-NEXT: buffer_wbl2 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: flat_atomic_xor v0, v[0:1], v2 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc1
+; GFX950-NEXT: v_accvgpr_write_b32 a31, v19 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a27, v23 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a28, v22 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a29, v21 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a30, v20 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a23, v27 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a24, v26 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a25, v25 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a26, v24 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_read_b32 v0, a2
-; GFX950-NEXT: v_accvgpr_read_b32 v1, a3
-; GFX950-NEXT: v_accvgpr_read_b32 v2, a4
-; GFX950-NEXT: v_accvgpr_read_b32 v3, a5
-; GFX950-NEXT: v_accvgpr_read_b32 v4, a6
-; GFX950-NEXT: v_accvgpr_read_b32 v5, a7
-; GFX950-NEXT: v_accvgpr_read_b32 v6, a8
-; GFX950-NEXT: v_accvgpr_read_b32 v7, a9
-; GFX950-NEXT: v_accvgpr_read_b32 v8, a10
-; GFX950-NEXT: v_accvgpr_read_b32 v9, a11
-; GFX950-NEXT: v_accvgpr_read_b32 v10, a12
-; GFX950-NEXT: v_accvgpr_read_b32 v11, a13
-; GFX950-NEXT: v_accvgpr_read_b32 v12, a14
-; GFX950-NEXT: v_accvgpr_read_b32 v13, a15
-; GFX950-NEXT: v_accvgpr_read_b32 v14, a16
-; GFX950-NEXT: v_accvgpr_read_b32 v15, a17
-; GFX950-NEXT: v_accvgpr_read_b32 v16, a18
-; GFX950-NEXT: v_accvgpr_read_b32 v17, a19
-; GFX950-NEXT: v_accvgpr_read_b32 v18, a20
-; GFX950-NEXT: v_accvgpr_read_b32 v19, a21
-; GFX950-NEXT: v_accvgpr_read_b32 v20, a22
-; GFX950-NEXT: v_accvgpr_read_b32 v21, a23
-; GFX950-NEXT: v_accvgpr_read_b32 v22, a24
-; GFX950-NEXT: v_accvgpr_read_b32 v23, a25
-; GFX950-NEXT: v_accvgpr_read_b32 v24, a26
-; GFX950-NEXT: v_accvgpr_read_b32 v25, a27
-; GFX950-NEXT: v_accvgpr_read_b32 v26, a28
-; GFX950-NEXT: v_accvgpr_read_b32 v27, a29
-; GFX950-NEXT: v_accvgpr_read_b32 v28, a30
-; GFX950-NEXT: v_accvgpr_read_b32 v29, a31
-; GFX950-NEXT: v_accvgpr_read_b32 v30, a32
-; GFX950-NEXT: v_accvgpr_read_b32 v31, a33
-; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v[0:31]
-; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: scratch_load_dword a34, off, s32 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword a33, off, s32 offset:4 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword a32, off, s32 offset:8 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v63, off, s32 offset:12 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v62, off, s32 offset:16 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v61, off, s32 offset:20 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v60, off, s32 offset:24 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v59, off, s32 offset:28 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v58, off, s32 offset:32 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v57, off, s32 offset:36 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v56, off, s32 offset:40 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v47, off, s32 offset:44 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v46, off, s32 offset:48 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v45, off, s32 offset:52 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v44, off, s32 offset:56 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v43, off, s32 offset:60 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v42, off, s32 offset:64 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v41, off, s32 offset:68 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v40, off, s32 offset:72 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dwordx4 v[0:3], off, s32 ; 16-byte Folded Reload
+; GFX950-NEXT: scratch_load_dwordx4 v[4:7], off, s32 offset:16 ; 16-byte Folded Reload
+; GFX950-NEXT: scratch_load_dwordx4 v[8:11], off, s32 offset:32 ; 16-byte Folded Reload
+; GFX950-NEXT: scratch_load_dwordx4 v[12:15], off, s32 offset:48 ; 16-byte Folded Reload
+; GFX950-NEXT: v_accvgpr_read_b32 v19, a31 ; Reload Reuse
+; GFX950-NEXT: scratch_load_dwordx3 v[16:18], off, s32 offset:64 ; 12-byte Folded Reload
+; GFX950-NEXT: v_accvgpr_read_b32 v23, a27 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v22, a28 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v21, a29 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v20, a30 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v27, a23 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v26, a24 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v25, a25 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v24, a26 ; Reload Reuse
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_accvgpr_read_b32 v31, a19 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v30, a20 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v29, a21 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v28, a22 ; Reload Reuse
+; GFX950-NEXT: ;;#ASMSTART
+; GFX950-NEXT: ; use v[0:31]
+; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: v_accvgpr_read_b32 v63, a18 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v62, a17 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v61, a16 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v60, a15 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v59, a14 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v58, a13 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v57, a12 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v56, a11 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v47, a10 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v46, a9 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v45, a8 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v44, a7 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v43, a6 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v42, a5 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v41, a4 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v40, a3 ; Reload Reuse
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10
%data = call i32 asm "; def $0", "=^VA"()
@@ -4367,37 +4308,39 @@ define void @flat_atomic_xor_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1
-; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1
+; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
-; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB53_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1], v[4:5] glc
+; GFX90A-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
+; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: .LBB53_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB53_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
-; GFX90A-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(1)
-; GFX90A-NEXT: v_xor_b32_e32 v1, v3, v5
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_xor_b32_e32 v4, v2, v4
-; GFX90A-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v4
+; GFX90A-NEXT: v_xor_b32_e32 v3, v1, v3
+; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB53_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -4412,35 +4355,37 @@ define void @flat_atomic_xor_i64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
-; GFX950-NEXT: v_accvgpr_read_b32 v5, a1
-; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
-; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
+; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB53_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: buffer_wbl2 sc1
-; GFX950-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1], v[4:5] sc0
+; GFX950-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc1
+; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: .LBB53_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB53_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
-; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v6, off
+; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
+; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_xor_b32_e32 v1, v3, v5
-; GFX950-NEXT: v_xor_b32_e32 v0, v2, v4
-; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_xor_b32_e32 v3, v1, v3
+; GFX950-NEXT: v_xor_b32_e32 v2, v0, v2
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
; GFX950-NEXT: .LBB53_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -4552,36 +4497,38 @@ define void @flat_atomic_xor_i64_ret_v_a(ptr %ptr) #0 {
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:5]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB55_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1], v[4:5] glc
+; GFX90A-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
+; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: .LBB55_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB55_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
-; GFX90A-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(1)
-; GFX90A-NEXT: v_xor_b32_e32 v1, v3, v5
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_xor_b32_e32 v4, v2, v4
-; GFX90A-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v4
+; GFX90A-NEXT: v_xor_b32_e32 v3, v1, v3
+; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB55_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -4594,35 +4541,37 @@ define void @flat_atomic_xor_i64_ret_v_a(ptr %ptr) #0 {
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v[4:5]
+; GFX950-NEXT: ; def v[2:3]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB55_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: buffer_wbl2 sc1
-; GFX950-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1], v[4:5] sc0
+; GFX950-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc1
+; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: .LBB55_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB55_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
-; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v6, off
+; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
+; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_xor_b32_e32 v1, v3, v5
-; GFX950-NEXT: v_xor_b32_e32 v0, v2, v4
-; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_xor_b32_e32 v3, v1, v3
+; GFX950-NEXT: v_xor_b32_e32 v2, v0, v2
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
; GFX950-NEXT: .LBB55_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -4817,36 +4766,38 @@ define void @flat_atomic_xor_i64_ret_av_a(ptr %ptr) #0 {
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:5]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB58_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1], v[4:5] glc
+; GFX90A-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
+; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: .LBB58_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB58_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
-; GFX90A-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(1)
-; GFX90A-NEXT: v_xor_b32_e32 v1, v3, v5
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_xor_b32_e32 v4, v2, v4
-; GFX90A-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v4
+; GFX90A-NEXT: v_xor_b32_e32 v3, v1, v3
+; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB58_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -4859,35 +4810,37 @@ define void @flat_atomic_xor_i64_ret_av_a(ptr %ptr) #0 {
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v[4:5]
+; GFX950-NEXT: ; def v[2:3]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB58_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: buffer_wbl2 sc1
-; GFX950-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1], v[4:5] sc0
+; GFX950-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc1
+; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: .LBB58_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB58_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
-; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v6, off
+; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
+; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_xor_b32_e32 v1, v3, v5
-; GFX950-NEXT: v_xor_b32_e32 v0, v2, v4
-; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_xor_b32_e32 v3, v1, v3
+; GFX950-NEXT: v_xor_b32_e32 v2, v0, v2
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
; GFX950-NEXT: .LBB58_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -5487,13 +5440,13 @@ define void @flat_atomic_nand_i32_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB69_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -5516,12 +5469,12 @@ define void @flat_atomic_nand_i32_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB69_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -6109,13 +6062,13 @@ define void @flat_atomic_usub_cond_i32_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB85_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -6140,13 +6093,13 @@ define void @flat_atomic_usub_cond_i32_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB85_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -6240,12 +6193,12 @@ define void @flat_atomic_usub_sat_i32_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB87_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -6268,12 +6221,12 @@ define void @flat_atomic_usub_sat_i32_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB87_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -6352,45 +6305,48 @@ define void @flat_atomic_add_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_add_i64_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
-; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1
-; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
-; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
-; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1
+; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
+; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB89_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[4:5] glc
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: .LBB89_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB89_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
-; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
-; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v1, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v0, v4
-; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v1, v5, vcc
-; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v4
+; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v4, v3, vcc
+; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB89_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_add_i64_ret_a_a:
@@ -6398,41 +6354,43 @@ define void @flat_atomic_add_i64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3]
+; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
-; GFX950-NEXT: v_accvgpr_read_b32 v5, a1
-; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
-; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
+; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
+; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB89_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX950-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[4:5] sc0
-; GFX950-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] sc0
+; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB89_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB89_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc
-; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off
-; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, v[4:5]
-; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off
-; GFX950-NEXT: .LBB89_4: ; %atomicrmw.phi
-; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
+; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, v[2:3]
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
+; GFX950-NEXT: .LBB89_4: ; %atomicrmw.phi
+; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=a"()
@@ -6530,45 +6488,48 @@ define void @flat_atomic_sub_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_sub_i64_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
-; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1
-; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
-; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
-; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1
+; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
+; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB91_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[4:5] glc
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: .LBB91_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB91_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
-; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
-; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NEXT: v_sub_co_u32_e32 v2, vcc, v1, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_sub_co_u32_e32 v3, vcc, v0, v4
-; GFX90A-NEXT: v_subb_co_u32_e32 v4, vcc, v1, v5, vcc
-; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v4
+; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc
+; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB91_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_sub_i64_ret_a_a:
@@ -6576,43 +6537,45 @@ define void @flat_atomic_sub_i64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3]
+; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
-; GFX950-NEXT: v_accvgpr_read_b32 v5, a1
-; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
-; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
+; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
+; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB91_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX950-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[4:5] sc0
-; GFX950-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] sc0
+; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB91_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB91_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
-; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc
-; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off
-; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v4
+; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v5, vcc
-; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off
-; GFX950-NEXT: .LBB91_4: ; %atomicrmw.phi
-; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
+; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
+; GFX950-NEXT: .LBB91_4: ; %atomicrmw.phi
+; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=a"()
@@ -6712,45 +6675,48 @@ define void @flat_atomic_and_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_and_i64_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
-; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1
-; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
-; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
-; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1
+; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
+; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB93_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[4:5] glc
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: .LBB93_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB93_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
-; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
-; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
+; GFX90A-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_and_b32_e32 v3, v1, v5
-; GFX90A-NEXT: v_and_b32_e32 v4, v0, v4
-; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_and_b32_e32 v2, v4, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v4
+; GFX90A-NEXT: v_and_b32_e32 v3, v1, v3
+; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB93_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_and_i64_ret_a_a:
@@ -6758,42 +6724,44 @@ define void @flat_atomic_and_i64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3]
+; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
-; GFX950-NEXT: v_accvgpr_read_b32 v5, a1
-; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
-; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
+; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
+; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB93_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX950-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[4:5] sc0
-; GFX950-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] sc0
+; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB93_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB93_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc
-; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off
-; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_and_b32_e32 v3, v1, v5
-; GFX950-NEXT: v_and_b32_e32 v2, v0, v4
-; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off
-; GFX950-NEXT: .LBB93_4: ; %atomicrmw.phi
-; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
+; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_and_b32_e32 v3, v1, v3
+; GFX950-NEXT: v_and_b32_e32 v2, v0, v2
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
+; GFX950-NEXT: .LBB93_4: ; %atomicrmw.phi
+; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=a"()
@@ -6901,7 +6869,7 @@ define void @flat_atomic_nand_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: v_accvgpr_read_b32 v7, a1
; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
-; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB95_4
@@ -6918,6 +6886,8 @@ define void @flat_atomic_nand_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
@@ -6931,21 +6901,21 @@ define void @flat_atomic_nand_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: s_cbranch_execz .LBB95_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v4, vcc
-; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v4, vcc
+; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_and_b32_e32 v3, v1, v7
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_and_b32_e32 v4, v0, v6
-; GFX90A-NEXT: v_not_b32_e32 v4, v4
-; GFX90A-NEXT: v_not_b32_e32 v3, v3
-; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_and_b32_e32 v4, v2, v6
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_not_b32_e32 v2, v3
+; GFX90A-NEXT: v_not_b32_e32 v3, v4
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB95_6: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -6964,7 +6934,7 @@ define void @flat_atomic_nand_i64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
; GFX950-NEXT: v_accvgpr_read_b32 v7, a1
; GFX950-NEXT: v_accvgpr_read_b32 v6, a0
-; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB95_4
@@ -6981,6 +6951,8 @@ define void @flat_atomic_nand_i64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
@@ -7000,13 +6972,13 @@ define void @flat_atomic_nand_i64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_and_b32_e32 v2, v1, v7
; GFX950-NEXT: v_and_b32_e32 v5, v0, v6
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_not_b32_e32 v3, v2
; GFX950-NEXT: v_not_b32_e32 v2, v5
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
; GFX950-NEXT: .LBB95_6: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -7146,45 +7118,48 @@ define void @flat_atomic_or_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_or_i64_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
-; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1
-; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
-; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
-; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1
+; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
+; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB97_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[4:5] glc
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: .LBB97_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB97_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
-; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
-; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
+; GFX90A-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_or_b32_e32 v3, v1, v5
-; GFX90A-NEXT: v_or_b32_e32 v4, v0, v4
-; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_or_b32_e32 v2, v4, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v4
+; GFX90A-NEXT: v_or_b32_e32 v3, v1, v3
+; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB97_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_or_i64_ret_a_a:
@@ -7192,42 +7167,44 @@ define void @flat_atomic_or_i64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3]
+; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
-; GFX950-NEXT: v_accvgpr_read_b32 v5, a1
-; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
-; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
+; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
+; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB97_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX950-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[4:5] sc0
-; GFX950-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] sc0
+; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB97_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB97_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc
-; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off
-; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_or_b32_e32 v3, v1, v5
-; GFX950-NEXT: v_or_b32_e32 v2, v0, v4
-; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off
-; GFX950-NEXT: .LBB97_4: ; %atomicrmw.phi
-; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
+; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_or_b32_e32 v3, v1, v3
+; GFX950-NEXT: v_or_b32_e32 v2, v0, v2
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
+; GFX950-NEXT: .LBB97_4: ; %atomicrmw.phi
+; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=a"()
@@ -7332,40 +7309,43 @@ define void @flat_atomic_max_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1
-; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
-; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB99_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[4:5] glc
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: .LBB99_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB99_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
-; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
-; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc
+; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc
-; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v1, vcc
-; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[0:1]
+; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
; GFX90A-NEXT: .LBB99_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_max_i64_ret_a_a:
@@ -7373,44 +7353,46 @@ define void @flat_atomic_max_i64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
+; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
-; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
-; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
-; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
+; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB99_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX950-NEXT: flat_atomic_smax_x2 v[0:1], v[4:5], v[2:3] sc0
-; GFX950-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] sc0
+; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB99_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB99_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
-; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
+; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc
+; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3]
-; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
-; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
-; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
+; GFX950-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
+; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off
; GFX950-NEXT: .LBB99_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=a"()
@@ -7518,40 +7500,43 @@ define void @flat_atomic_min_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1
-; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
-; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB101_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[4:5] glc
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: .LBB101_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB101_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
-; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
-; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc
+; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc
-; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v1, vcc
-; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_cmp_le_i64_e32 vcc, v[2:3], v[0:1]
+; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
; GFX90A-NEXT: .LBB101_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_min_i64_ret_a_a:
@@ -7559,44 +7544,46 @@ define void @flat_atomic_min_i64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
+; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
-; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
-; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
-; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
+; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB101_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX950-NEXT: flat_atomic_smin_x2 v[0:1], v[4:5], v[2:3] sc0
-; GFX950-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] sc0
+; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB101_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB101_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
-; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
+; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc
+; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[2:3]
-; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
-; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
-; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
+; GFX950-NEXT: v_cmp_le_i64_e32 vcc, v[2:3], v[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
+; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off
; GFX950-NEXT: .LBB101_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=a"()
@@ -7704,40 +7691,43 @@ define void @flat_atomic_umax_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1
-; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
-; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB103_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[4:5] glc
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: .LBB103_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB103_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
-; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
-; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc
+; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc
-; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v1, vcc
-; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
+; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
; GFX90A-NEXT: .LBB103_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_umax_i64_ret_a_a:
@@ -7745,44 +7735,46 @@ define void @flat_atomic_umax_i64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
+; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
-; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
-; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
-; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
+; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB103_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX950-NEXT: flat_atomic_umax_x2 v[0:1], v[4:5], v[2:3] sc0
-; GFX950-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] sc0
+; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB103_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB103_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
-; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
+; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc
+; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
-; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
-; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
-; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
+; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
+; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off
; GFX950-NEXT: .LBB103_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=a"()
@@ -7890,40 +7882,43 @@ define void @flat_atomic_umin_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1
-; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
-; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB105_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[4:5] glc
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: .LBB105_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB105_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
-; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
-; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc
+; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc
-; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v1, vcc
-; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_cmp_le_u64_e32 vcc, v[2:3], v[0:1]
+; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
; GFX90A-NEXT: .LBB105_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_umin_i64_ret_a_a:
@@ -7931,44 +7926,46 @@ define void @flat_atomic_umin_i64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
+; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
-; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
-; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
-; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
+; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB105_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX950-NEXT: flat_atomic_umin_x2 v[0:1], v[4:5], v[2:3] sc0
-; GFX950-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] sc0
+; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB105_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB105_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
-; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
+; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc
+; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[2:3]
-; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
-; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
-; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
+; GFX950-NEXT: v_cmp_le_u64_e32 vcc, v[2:3], v[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
+; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off
; GFX950-NEXT: .LBB105_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=a"()
@@ -8076,42 +8073,45 @@ define void @flat_atomic_uinc_wrap_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1
-; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
-; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB107_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[4:5] glc
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: .LBB107_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB107_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
-; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
-; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc
+; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, 1, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 1, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc
-; GFX90A-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, 0, v6, vcc
-; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v3, vcc
+; GFX90A-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc
+; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB107_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_uinc_wrap_i64_ret_a_a:
@@ -8119,45 +8119,46 @@ define void @flat_atomic_uinc_wrap_i64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3]
+; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
-; GFX950-NEXT: v_accvgpr_read_b32 v5, a1
-; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
-; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
+; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
+; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB107_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX950-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[4:5] sc0
-; GFX950-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] sc0
+; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB107_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB107_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc
+; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, 1
-; GFX950-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[4:5]
-; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
-; GFX950-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, 1
+; GFX950-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX950-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc
+; GFX950-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off
; GFX950-NEXT: .LBB107_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=a"()
@@ -8262,50 +8263,53 @@ define void @flat_atomic_udec_wrap_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_udec_wrap_i64_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
+; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1
-; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
-; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
-; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
+; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB109_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] glc
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX90A-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: .LBB109_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB109_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
-; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc
+; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, -1, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, -1, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, -1, v1, vcc
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GFX90A-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3]
+; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, -1, v3, vcc
+; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GFX90A-NEXT: v_cmp_gt_u64_e64 s[4:5], v[2:3], v[0:1]
; GFX90A-NEXT: s_or_b64 vcc, vcc, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
-; GFX90A-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc
-; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
; GFX90A-NEXT: .LBB109_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_udec_wrap_i64_ret_a_a:
@@ -8313,46 +8317,48 @@ define void @flat_atomic_udec_wrap_i64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3]
+; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
-; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
-; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
-; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
+; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB109_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX950-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] sc0
-; GFX950-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] sc0
+; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB109_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[2:3], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB109_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc
-; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off
+; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc
+; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v6, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GFX950-NEXT: v_cmp_gt_u64_e64 s[0:1], v[0:1], v[2:3]
-; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, -1
+; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GFX950-NEXT: v_cmp_gt_u64_e64 s[0:1], v[2:3], v[0:1]
+; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[2:3], 0, -1
; GFX950-NEXT: s_or_b64 vcc, vcc, s[0:1]
-; GFX950-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX950-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
-; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
+; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off
; GFX950-NEXT: .LBB109_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=a"()
@@ -8460,62 +8466,64 @@ define void @flat_atomic_usub_cond_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_usub_cond_i64_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
+; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v7, a1
-; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0
-; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
-; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1
+; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7
+; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB111_4
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: .LBB111_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v6
-; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v7, vcc
-; GFX90A-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[6:7]
+; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4
+; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc
+; GFX90A-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB111_2
; GFX90A-NEXT: ; %bb.3: ; %Flow
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: .LBB111_4: ; %Flow3
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB111_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
-; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v4, vcc
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc
; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(1)
-; GFX90A-NEXT: v_sub_co_u32_e32 v3, vcc, v0, v6
+; GFX90A-NEXT: v_sub_co_u32_e32 v3, vcc, v0, v4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_subb_co_u32_e32 v4, vcc, v1, v7, vcc
-; GFX90A-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[6:7]
-; GFX90A-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v1, v4, vcc
-; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen
+; GFX90A-NEXT: v_subb_co_u32_e32 v6, vcc, v1, v5, vcc
+; GFX90A-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX90A-NEXT: v_cndmask_b32_e32 v4, v1, v6, vcc
+; GFX90A-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB111_6: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -8534,7 +8542,7 @@ define void @flat_atomic_usub_cond_i64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v7
; GFX950-NEXT: v_accvgpr_read_b32 v5, a1
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
-; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB111_4
@@ -8554,6 +8562,8 @@ define void @flat_atomic_usub_cond_i64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
@@ -8575,14 +8585,13 @@ define void @flat_atomic_usub_cond_i64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v5, vcc
; GFX950-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[4:5]
-; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc
; GFX950-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc
; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off
; GFX950-NEXT: .LBB111_6: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -8739,7 +8748,7 @@ define void @flat_atomic_usub_sat_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: v_accvgpr_read_b32 v7, a1
; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
-; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB113_4
@@ -8757,6 +8766,8 @@ define void @flat_atomic_usub_sat_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
@@ -8778,14 +8789,14 @@ define void @flat_atomic_usub_sat_i64_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v7, vcc
; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_cndmask_b32_e64 v0, v3, 0, vcc
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
+; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB113_6: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -8804,7 +8815,7 @@ define void @flat_atomic_usub_sat_i64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
; GFX950-NEXT: v_accvgpr_read_b32 v7, a1
; GFX950-NEXT: v_accvgpr_read_b32 v6, a0
-; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB113_4
@@ -8824,6 +8835,8 @@ define void @flat_atomic_usub_sat_i64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
@@ -8845,14 +8858,13 @@ define void @flat_atomic_usub_sat_i64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v7, vcc
; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
-; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
; GFX950-NEXT: .LBB113_6: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -9010,53 +9022,55 @@ define void @flat_atomic_fadd_f32_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v3, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
-; GFX90A-NEXT: ; implicit-def: $vgpr2
+; GFX90A-NEXT: ; implicit-def: $agpr0
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB115_6
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1
-; GFX90A-NEXT: ; implicit-def: $vgpr2
+; GFX90A-NEXT: ; implicit-def: $agpr0
; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc
; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execz .LBB115_3
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global
-; GFX90A-NEXT: global_atomic_add_f32 v2, v[0:1], v3, off glc
+; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off glc
+; GFX90A-NEXT: ; implicit-def: $vgpr2
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX90A-NEXT: ; implicit-def: $vgpr3
; GFX90A-NEXT: .LBB115_3: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7]
; GFX90A-NEXT: s_cbranch_execz .LBB115_5
; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
-; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_add_f32_e32 v1, v2, v3
-; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX90A-NEXT: v_add_f32_e32 v2, v1, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1
+; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
; GFX90A-NEXT: .LBB115_5: ; %Flow1
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX90A-NEXT: ; implicit-def: $vgpr3
+; GFX90A-NEXT: ; implicit-def: $vgpr2
; GFX90A-NEXT: .LBB115_6: ; %Flow2
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB115_8
; GFX90A-NEXT: ; %bb.7: ; %atomicrmw.shared
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: ds_add_rtn_f32 v2, v0, v3
+; GFX90A-NEXT: ds_add_rtn_f32 v0, v0, v2
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: .LBB115_8: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fadd_f32_ret_a_a:
@@ -9175,12 +9189,12 @@ define void @flat_atomic_fsub_f32_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB117_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -9203,12 +9217,12 @@ define void @flat_atomic_fsub_f32_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB117_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -9298,13 +9312,13 @@ define void @flat_atomic_fmax_f32_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB119_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -9328,13 +9342,13 @@ define void @flat_atomic_fmax_f32_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB119_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -9428,13 +9442,13 @@ define void @flat_atomic_fmin_f32_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB121_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -9458,13 +9472,13 @@ define void @flat_atomic_fmin_f32_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB121_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -9559,13 +9573,13 @@ define void @flat_atomic_fmaximum_f32_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB123_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -9588,12 +9602,12 @@ define void @flat_atomic_fmaximum_f32_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB123_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -9687,13 +9701,13 @@ define void @flat_atomic_fminimum_f32_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB125_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -9716,12 +9730,12 @@ define void @flat_atomic_fminimum_f32_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB125_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -9803,63 +9817,68 @@ define void @flat_atomic_fadd_f64_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fadd_f64_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base
-; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1
-; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
-; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
-; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1
+; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
+; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB127_6
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
-; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v3
-; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1
+; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc
; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execz .LBB127_3
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global
-; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[2:3], v[4:5], off glc
+; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off glc
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: .LBB127_3: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7]
; GFX90A-NEXT: s_cbranch_execz .LBB127_5
; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.private
-; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
-; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc
-; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
+; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], v[4:5]
-; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3]
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB127_5: ; %Flow1
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: .LBB127_6: ; %Flow2
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB127_8
; GFX90A-NEXT: ; %bb.7: ; %atomicrmw.shared
-; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v2, vcc
-; GFX90A-NEXT: ds_add_rtn_f64 v[0:1], v0, v[4:5]
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GFX90A-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3]
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: .LBB127_8: ; %atomicrmw.phi
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX90A-NEXT: .LBB127_8: ; %atomicrmw.phi
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fadd_f64_ret_a_a:
@@ -9867,61 +9886,65 @@ define void @flat_atomic_fadd_f64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_shared_base
-; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3]
+; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
-; GFX950-NEXT: v_accvgpr_read_b32 v5, a1
-; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
-; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
+; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
+; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB127_6
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.check.private
; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base
-; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s3, v3
-; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s3, v1
+; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX950-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
; GFX950-NEXT: s_cbranch_execz .LBB127_3
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.global
-; GFX950-NEXT: global_atomic_add_f64 v[0:1], v[2:3], v[4:5], off sc0
+; GFX950-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off sc0
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB127_3: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3]
; GFX950-NEXT: s_cbranch_execz .LBB127_5
; GFX950-NEXT: ; %bb.4: ; %atomicrmw.private
-; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc
-; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off
+; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
+; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_add_f64 v[2:3], v[0:1], v[4:5]
-; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
; GFX950-NEXT: .LBB127_5: ; %Flow1
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: .LBB127_6: ; %Flow2
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB127_8
; GFX950-NEXT: ; %bb.7: ; %atomicrmw.shared
-; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
-; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_cndmask_b32_e32 v0, -1, v2, vcc
-; GFX950-NEXT: ds_add_rtn_f64 v[0:1], v0, v[4:5]
+; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GFX950-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3]
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-NEXT: .LBB127_8: ; %atomicrmw.phi
-; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX950-NEXT: .LBB127_8: ; %atomicrmw.phi
+; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10
%data = call double asm "; def $0", "=a"()
@@ -10066,7 +10089,7 @@ define void @flat_atomic_fsub_f64_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: v_accvgpr_read_b32 v7, a1
; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
-; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB129_4
@@ -10080,7 +10103,9 @@ define void @flat_atomic_fsub_f64_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB129_2
@@ -10096,14 +10121,15 @@ define void @flat_atomic_fsub_f64_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], -v[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB129_6: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -10122,7 +10148,7 @@ define void @flat_atomic_fsub_f64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
; GFX950-NEXT: v_accvgpr_read_b32 v7, a1
; GFX950-NEXT: v_accvgpr_read_b32 v6, a0
-; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB129_4
@@ -10136,7 +10162,9 @@ define void @flat_atomic_fsub_f64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB129_2
@@ -10153,12 +10181,12 @@ define void @flat_atomic_fsub_f64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_add_f64 v[2:3], v[0:1], -v[6:7]
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off
; GFX950-NEXT: .LBB129_6: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -10285,46 +10313,49 @@ define void @flat_atomic_fmax_f64_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmax_f64_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
-; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1
-; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
-; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
-; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1
+; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
+; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB131_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[2:3], v[4:5] glc
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: .LBB131_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB131_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
-; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX90A-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3]
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB131_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmax_f64_ret_a_a:
@@ -10332,43 +10363,45 @@ define void @flat_atomic_fmax_f64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3]
+; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
-; GFX950-NEXT: v_accvgpr_read_b32 v5, a1
-; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
-; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
+; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
+; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB131_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX950-NEXT: flat_atomic_max_f64 v[0:1], v[2:3], v[4:5] sc0
-; GFX950-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] sc0
+; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB131_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB131_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
-; GFX950-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX950-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc
+; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1]
-; GFX950-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
+; GFX950-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off
; GFX950-NEXT: .LBB131_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10
%data = call double asm "; def $0", "=a"()
@@ -10469,46 +10502,49 @@ define void @flat_atomic_fmin_f64_ret_a_a(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmin_f64_ret_a_a:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
-; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1
-; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
-; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3
-; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1
+; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
+; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB133_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[2:3], v[4:5] glc
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: .LBB133_2: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB133_4
; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
-; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX90A-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX90A-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3]
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB133_4: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmin_f64_ret_a_a:
@@ -10516,43 +10552,45 @@ define void @flat_atomic_fmin_f64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[2:3], 0x50
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3]
+; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
-; GFX950-NEXT: v_accvgpr_read_b32 v5, a1
-; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
-; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
+; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
+; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB133_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX950-NEXT: flat_atomic_min_f64 v[0:1], v[2:3], v[4:5] sc0
-; GFX950-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] sc0
+; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB133_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB133_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
-; GFX950-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX950-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc
+; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1]
-; GFX950-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
+; GFX950-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off
; GFX950-NEXT: .LBB133_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10
%data = call double asm "; def $0", "=a"()
@@ -10662,7 +10700,7 @@ define void @flat_atomic_fmaximum_f64_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7
-; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB135_4
@@ -10680,6 +10718,8 @@ define void @flat_atomic_fmaximum_f64_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
@@ -10697,17 +10737,18 @@ define void @flat_atomic_fmaximum_f64_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000
+; GFX90A-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5]
; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc
; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB135_6: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -10726,7 +10767,7 @@ define void @flat_atomic_fmaximum_f64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v7
; GFX950-NEXT: v_accvgpr_read_b32 v5, a1
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
-; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB135_4
@@ -10745,6 +10786,8 @@ define void @flat_atomic_fmaximum_f64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
@@ -10765,14 +10808,13 @@ define void @flat_atomic_fmaximum_f64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5]
; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
-; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off
; GFX950-NEXT: .LBB135_6: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -10926,7 +10968,7 @@ define void @flat_atomic_fminimum_f64_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7
-; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB137_4
@@ -10944,6 +10986,8 @@ define void @flat_atomic_fminimum_f64_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
@@ -10961,17 +11005,18 @@ define void @flat_atomic_fminimum_f64_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000
+; GFX90A-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5]
; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc
; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB137_6: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -10990,7 +11035,7 @@ define void @flat_atomic_fminimum_f64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v7
; GFX950-NEXT: v_accvgpr_read_b32 v5, a1
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
-; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB137_4
@@ -11009,6 +11054,8 @@ define void @flat_atomic_fminimum_f64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
@@ -11029,14 +11076,13 @@ define void @flat_atomic_fminimum_f64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5]
; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
-; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off
; GFX950-NEXT: .LBB137_6: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -11199,12 +11245,12 @@ define void @flat_atomic_fadd_v2f16_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB139_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -11296,12 +11342,12 @@ define void @flat_atomic_fsub_v2f16_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB141_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -11324,12 +11370,12 @@ define void @flat_atomic_fsub_v2f16_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB141_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -11419,13 +11465,13 @@ define void @flat_atomic_fmax_v2f16_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB143_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -11450,13 +11496,13 @@ define void @flat_atomic_fmax_v2f16_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB143_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -11551,13 +11597,13 @@ define void @flat_atomic_fmin_v2f16_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB145_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -11582,13 +11628,13 @@ define void @flat_atomic_fmin_v2f16_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB145_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -11688,13 +11734,13 @@ define void @flat_atomic_fmaximum_v2f16_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB147_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -11717,12 +11763,12 @@ define void @flat_atomic_fmaximum_v2f16_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB147_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -11824,13 +11870,13 @@ define void @flat_atomic_fminimum_v2f16_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB149_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -11853,12 +11899,12 @@ define void @flat_atomic_fminimum_v2f16_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB149_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -11975,13 +12021,13 @@ define void @flat_atomic_fadd_v2bf16_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB151_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -12108,13 +12154,13 @@ define void @flat_atomic_fsub_v2bf16_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB153_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -12142,13 +12188,13 @@ define void @flat_atomic_fsub_v2bf16_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB153_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -12278,13 +12324,13 @@ define void @flat_atomic_fmax_v2bf16_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB155_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -12312,13 +12358,13 @@ define void @flat_atomic_fmax_v2bf16_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB155_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -12448,13 +12494,13 @@ define void @flat_atomic_fmin_v2bf16_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB157_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -12482,13 +12528,13 @@ define void @flat_atomic_fmin_v2bf16_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB157_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -12623,13 +12669,13 @@ define void @flat_atomic_fmaximum_v2bf16_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB159_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -12657,13 +12703,13 @@ define void @flat_atomic_fmaximum_v2bf16_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB159_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -12803,13 +12849,13 @@ define void @flat_atomic_fminimum_v2bf16_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB161_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -12837,13 +12883,13 @@ define void @flat_atomic_fminimum_v2bf16_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB161_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -13282,13 +13328,13 @@ define void @flat_atomic_nand_i32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB171_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -13312,12 +13358,12 @@ define void @flat_atomic_nand_i32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB171_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -14043,13 +14089,13 @@ define void @flat_atomic_usub_cond_i32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB189_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -14075,13 +14121,13 @@ define void @flat_atomic_usub_cond_i32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB189_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -14178,12 +14224,12 @@ define void @flat_atomic_usub_sat_i32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB191_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -14207,12 +14253,12 @@ define void @flat_atomic_usub_sat_i32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB191_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -14309,26 +14355,28 @@ define void @flat_atomic_xchg_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_swap_x2 v[2:3], v[2:3], v[0:1] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: s_cbranch_execz .LBB193_3
; GFX90A-NEXT: s_branch .LBB193_4
; GFX90A-NEXT: .LBB193_2:
-; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: .LBB193_3: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
-; GFX90A-NEXT: v_mov_b32_e32 v4, s4
-; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NEXT: buffer_load_dword a0, v2, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword a1, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB193_4: ; %atomicrmw.end
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
+; GFX90A-NEXT: s_waitcnt vmcnt(2)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xchg_i64_saddr_ret_a_a:
@@ -14350,23 +14398,25 @@ define void @flat_atomic_xchg_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_accvgpr_write_b32 a3, v1
+; GFX950-NEXT: v_accvgpr_write_b32 a2, v0
; GFX950-NEXT: s_cbranch_execz .LBB193_3
; GFX950-NEXT: s_branch .LBB193_4
; GFX950-NEXT: .LBB193_2:
-; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-NEXT: ; implicit-def: $agpr2_agpr3
; GFX950-NEXT: .LBB193_3: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
-; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0
+; GFX950-NEXT: scratch_load_dwordx2 a[2:3], off, s0
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: scratch_store_dwordx2 off, a[0:1], s0
; GFX950-NEXT: .LBB193_4: ; %atomicrmw.end
-; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX950-NEXT: s_waitcnt vmcnt(1)
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use a[0:1]
+; GFX950-NEXT: ; use a[2:3]
; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=a"()
@@ -14473,28 +14523,32 @@ define void @flat_atomic_add_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_add_x2 v[2:3], v[2:3], v[0:1] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: s_cbranch_execz .LBB195_3
; GFX90A-NEXT: s_branch .LBB195_4
; GFX90A-NEXT: .LBB195_2:
-; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: .LBB195_3: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
-; GFX90A-NEXT: v_mov_b32_e32 v4, s4
-; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v3
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
-; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v4
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
+; GFX90A-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB195_4: ; %atomicrmw.end
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_add_i64_saddr_ret_a_a:
@@ -14516,24 +14570,27 @@ define void @flat_atomic_add_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-NEXT: flat_atomic_add_x2 v[2:3], v[2:3], v[0:1] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: s_cbranch_execz .LBB195_3
; GFX950-NEXT: s_branch .LBB195_4
; GFX950-NEXT: .LBB195_2:
-; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: .LBB195_3: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0
; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 0, v[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0
; GFX950-NEXT: .LBB195_4: ; %atomicrmw.end
-; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=a"()
@@ -14643,28 +14700,32 @@ define void @flat_atomic_sub_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_sub_x2 v[2:3], v[2:3], v[0:1] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: s_cbranch_execz .LBB197_3
; GFX90A-NEXT: s_branch .LBB197_4
; GFX90A-NEXT: .LBB197_2:
-; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: .LBB197_3: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
-; GFX90A-NEXT: v_mov_b32_e32 v4, s4
-; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v3, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v3
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v0
-; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc
-; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v4
+; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v4, v1, vcc
+; GFX90A-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB197_4: ; %atomicrmw.end
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_sub_i64_saddr_ret_a_a:
@@ -14678,34 +14739,37 @@ define void @flat_atomic_sub_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
-; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
+; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
-; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
; GFX950-NEXT: s_cbranch_vccz .LBB197_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] sc0
+; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-NEXT: flat_atomic_sub_x2 v[2:3], v[2:3], v[0:1] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: s_cbranch_execz .LBB197_3
; GFX950-NEXT: s_branch .LBB197_4
; GFX950-NEXT: .LBB197_2:
-; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: .LBB197_3: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
-; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0
+; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2
-; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
-; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
+; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
+; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0
; GFX950-NEXT: .LBB197_4: ; %atomicrmw.end
-; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=a"()
@@ -14817,28 +14881,32 @@ define void @flat_atomic_and_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_and_x2 v[2:3], v[2:3], v[0:1] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: s_cbranch_execz .LBB199_3
; GFX90A-NEXT: s_branch .LBB199_4
; GFX90A-NEXT: .LBB199_2:
-; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: .LBB199_3: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
-; GFX90A-NEXT: v_mov_b32_e32 v4, s4
-; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
+; GFX90A-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen
+; GFX90A-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_and_b32_e32 v0, v4, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v4
; GFX90A-NEXT: v_and_b32_e32 v1, v3, v1
-; GFX90A-NEXT: v_and_b32_e32 v0, v2, v0
-; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB199_4: ; %atomicrmw.end
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_and_i64_saddr_ret_a_a:
@@ -14852,33 +14920,36 @@ define void @flat_atomic_and_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
-; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
+; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
-; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
; GFX950-NEXT: s_cbranch_vccz .LBB199_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] sc0
+; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-NEXT: flat_atomic_and_x2 v[2:3], v[2:3], v[0:1] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: s_cbranch_execz .LBB199_3
; GFX950-NEXT: s_branch .LBB199_4
; GFX950-NEXT: .LBB199_2:
-; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: .LBB199_3: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
-; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0
+; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_and_b32_e32 v3, v1, v3
-; GFX950-NEXT: v_and_b32_e32 v2, v0, v2
-; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX950-NEXT: v_and_b32_e32 v1, v3, v1
+; GFX950-NEXT: v_and_b32_e32 v0, v2, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
+; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0
; GFX950-NEXT: .LBB199_4: ; %atomicrmw.end
-; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=a"()
@@ -14999,6 +15070,8 @@ define void @flat_atomic_nand_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
@@ -15007,25 +15080,25 @@ define void @flat_atomic_nand_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_branch .LBB201_6
; GFX90A-NEXT: .LBB201_4:
-; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_cbranch_execz .LBB201_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
-; GFX90A-NEXT: v_mov_b32_e32 v2, s4
-; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
+; GFX90A-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_and_b32_e32 v3, v1, v5
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_and_b32_e32 v4, v0, v4
-; GFX90A-NEXT: v_not_b32_e32 v4, v4
-; GFX90A-NEXT: v_not_b32_e32 v3, v3
-; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: .LBB201_6: ; %atomicrmw.phi
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_and_b32_e32 v4, v2, v4
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_not_b32_e32 v2, v3
+; GFX90A-NEXT: v_not_b32_e32 v3, v4
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: .LBB201_6: ; %atomicrmw.phi
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -15061,6 +15134,8 @@ define void @flat_atomic_nand_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
@@ -15069,7 +15144,7 @@ define void @flat_atomic_nand_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_branch .LBB201_6
; GFX950-NEXT: .LBB201_4:
-; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_cbranch_execz .LBB201_6
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
@@ -15078,12 +15153,12 @@ define void @flat_atomic_nand_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_and_b32_e32 v2, v1, v5
; GFX950-NEXT: v_and_b32_e32 v4, v0, v4
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_not_b32_e32 v3, v2
; GFX950-NEXT: v_not_b32_e32 v2, v4
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
; GFX950-NEXT: .LBB201_6: ; %atomicrmw.phi
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -15235,28 +15310,32 @@ define void @flat_atomic_or_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_or_x2 v[2:3], v[2:3], v[0:1] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: s_cbranch_execz .LBB203_3
; GFX90A-NEXT: s_branch .LBB203_4
; GFX90A-NEXT: .LBB203_2:
-; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: .LBB203_3: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
-; GFX90A-NEXT: v_mov_b32_e32 v4, s4
-; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
+; GFX90A-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen
+; GFX90A-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_or_b32_e32 v0, v4, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v4
; GFX90A-NEXT: v_or_b32_e32 v1, v3, v1
-; GFX90A-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB203_4: ; %atomicrmw.end
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_or_i64_saddr_ret_a_a:
@@ -15270,33 +15349,36 @@ define void @flat_atomic_or_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
-; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
+; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
-; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
; GFX950-NEXT: s_cbranch_vccz .LBB203_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] sc0
+; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-NEXT: flat_atomic_or_x2 v[2:3], v[2:3], v[0:1] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: s_cbranch_execz .LBB203_3
; GFX950-NEXT: s_branch .LBB203_4
; GFX950-NEXT: .LBB203_2:
-; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: .LBB203_3: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
-; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0
+; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_or_b32_e32 v3, v1, v3
-; GFX950-NEXT: v_or_b32_e32 v2, v0, v2
-; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX950-NEXT: v_or_b32_e32 v1, v3, v1
+; GFX950-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
+; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0
; GFX950-NEXT: .LBB203_4: ; %atomicrmw.end
-; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=a"()
@@ -15407,28 +15489,32 @@ define void @flat_atomic_xor_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_xor_x2 v[2:3], v[2:3], v[0:1] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: s_cbranch_execz .LBB205_3
; GFX90A-NEXT: s_branch .LBB205_4
; GFX90A-NEXT: .LBB205_2:
-; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: .LBB205_3: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
-; GFX90A-NEXT: v_mov_b32_e32 v4, s4
-; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
+; GFX90A-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen
+; GFX90A-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_xor_b32_e32 v0, v4, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v4
; GFX90A-NEXT: v_xor_b32_e32 v1, v3, v1
-; GFX90A-NEXT: v_xor_b32_e32 v0, v2, v0
-; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB205_4: ; %atomicrmw.end
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_xor_i64_saddr_ret_a_a:
@@ -15442,33 +15528,36 @@ define void @flat_atomic_xor_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
-; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
+; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
-; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
; GFX950-NEXT: s_cbranch_vccz .LBB205_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] sc0
+; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-NEXT: flat_atomic_xor_x2 v[2:3], v[2:3], v[0:1] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: s_cbranch_execz .LBB205_3
; GFX950-NEXT: s_branch .LBB205_4
; GFX950-NEXT: .LBB205_2:
-; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: .LBB205_3: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
-; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0
+; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_xor_b32_e32 v3, v1, v3
-; GFX950-NEXT: v_xor_b32_e32 v2, v0, v2
-; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX950-NEXT: v_xor_b32_e32 v1, v3, v1
+; GFX950-NEXT: v_xor_b32_e32 v0, v2, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
+; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0
; GFX950-NEXT: .LBB205_4: ; %atomicrmw.end
-; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=a"()
@@ -15579,29 +15668,33 @@ define void @flat_atomic_max_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_smax_x2 v[2:3], v[2:3], v[0:1] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: s_cbranch_execz .LBB207_3
; GFX90A-NEXT: s_branch .LBB207_4
; GFX90A-NEXT: .LBB207_2:
-; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: .LBB207_3: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[0:1]
-; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
; GFX90A-NEXT: .LBB207_4: ; %atomicrmw.end
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_max_i64_saddr_ret_a_a:
@@ -15615,35 +15708,38 @@ define void @flat_atomic_max_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
-; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
+; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
-; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
; GFX950-NEXT: s_cbranch_vccz .LBB207_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] sc0
+; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-NEXT: flat_atomic_smax_x2 v[2:3], v[2:3], v[0:1] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: s_cbranch_execz .LBB207_3
; GFX950-NEXT: s_branch .LBB207_4
; GFX950-NEXT: .LBB207_2:
-; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: .LBB207_3: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
-; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0
+; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3]
-; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
-; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
-; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
+; GFX950-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
+; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0
; GFX950-NEXT: .LBB207_4: ; %atomicrmw.end
-; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=a"()
@@ -15757,29 +15853,33 @@ define void @flat_atomic_min_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_smin_x2 v[2:3], v[2:3], v[0:1] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: s_cbranch_execz .LBB209_3
; GFX90A-NEXT: s_branch .LBB209_4
; GFX90A-NEXT: .LBB209_2:
-; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: .LBB209_3: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_le_i64_e32 vcc, v[2:3], v[0:1]
-; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
; GFX90A-NEXT: .LBB209_4: ; %atomicrmw.end
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_min_i64_saddr_ret_a_a:
@@ -15793,35 +15893,38 @@ define void @flat_atomic_min_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
-; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
+; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
-; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
; GFX950-NEXT: s_cbranch_vccz .LBB209_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] sc0
+; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-NEXT: flat_atomic_smin_x2 v[2:3], v[2:3], v[0:1] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: s_cbranch_execz .LBB209_3
; GFX950-NEXT: s_branch .LBB209_4
; GFX950-NEXT: .LBB209_2:
-; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: .LBB209_3: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
-; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0
+; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[2:3]
-; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
-; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
-; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
+; GFX950-NEXT: v_cmp_le_i64_e32 vcc, v[2:3], v[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
+; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0
; GFX950-NEXT: .LBB209_4: ; %atomicrmw.end
-; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=a"()
@@ -15935,29 +16038,33 @@ define void @flat_atomic_umax_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_umax_x2 v[2:3], v[2:3], v[0:1] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: s_cbranch_execz .LBB211_3
; GFX90A-NEXT: s_branch .LBB211_4
; GFX90A-NEXT: .LBB211_2:
-; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: .LBB211_3: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
-; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
; GFX90A-NEXT: .LBB211_4: ; %atomicrmw.end
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_umax_i64_saddr_ret_a_a:
@@ -15971,35 +16078,38 @@ define void @flat_atomic_umax_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
-; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
+; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
-; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
; GFX950-NEXT: s_cbranch_vccz .LBB211_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] sc0
+; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-NEXT: flat_atomic_umax_x2 v[2:3], v[2:3], v[0:1] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: s_cbranch_execz .LBB211_3
; GFX950-NEXT: s_branch .LBB211_4
; GFX950-NEXT: .LBB211_2:
-; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: .LBB211_3: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
-; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0
+; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
-; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
-; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
-; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
+; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
+; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0
; GFX950-NEXT: .LBB211_4: ; %atomicrmw.end
-; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=a"()
@@ -16113,29 +16223,33 @@ define void @flat_atomic_umin_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_umin_x2 v[2:3], v[2:3], v[0:1] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: s_cbranch_execz .LBB213_3
; GFX90A-NEXT: s_branch .LBB213_4
; GFX90A-NEXT: .LBB213_2:
-; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: .LBB213_3: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_le_u64_e32 vcc, v[2:3], v[0:1]
-; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
; GFX90A-NEXT: .LBB213_4: ; %atomicrmw.end
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_umin_i64_saddr_ret_a_a:
@@ -16149,35 +16263,38 @@ define void @flat_atomic_umin_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
-; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
+; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
-; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
; GFX950-NEXT: s_cbranch_vccz .LBB213_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] sc0
+; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-NEXT: flat_atomic_umin_x2 v[2:3], v[2:3], v[0:1] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: s_cbranch_execz .LBB213_3
; GFX950-NEXT: s_branch .LBB213_4
; GFX950-NEXT: .LBB213_2:
-; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: .LBB213_3: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
-; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0
-; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[2:3]
-; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
-; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
-; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
-; GFX950-NEXT: .LBB213_4: ; %atomicrmw.end
+; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX950-NEXT: v_cmp_le_u64_e32 vcc, v[2:3], v[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
+; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0
+; GFX950-NEXT: .LBB213_4: ; %atomicrmw.end
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=a"()
@@ -16283,39 +16400,43 @@ define void @flat_atomic_uinc_wrap_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
-; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1
+; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
-; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
; GFX90A-NEXT: s_cbranch_vccz .LBB215_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
-; GFX90A-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT: flat_atomic_inc_x2 v[2:3], v[2:3], v[0:1] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: s_cbranch_execz .LBB215_3
; GFX90A-NEXT: s_branch .LBB215_4
; GFX90A-NEXT: .LBB215_2:
-; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: .LBB215_3: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
-; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, 1, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, 1, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc
-; GFX90A-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
-; GFX90A-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc
-; GFX90A-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc
-; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v3, vcc
+; GFX90A-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc
+; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB215_4: ; %atomicrmw.end
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_uinc_wrap_i64_saddr_ret_a_a:
@@ -16329,36 +16450,38 @@ define void @flat_atomic_uinc_wrap_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
-; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
+; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
-; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
; GFX950-NEXT: s_cbranch_vccz .LBB215_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] sc0
+; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-NEXT: flat_atomic_inc_x2 v[2:3], v[2:3], v[0:1] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: s_cbranch_execz .LBB215_3
; GFX950-NEXT: s_branch .LBB215_4
; GFX950-NEXT: .LBB215_2:
-; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: .LBB215_3: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
-; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0
+; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, 1
-; GFX950-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
-; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc
-; GFX950-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
-; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
+; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[2:3], 0, 1
+; GFX950-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
+; GFX950-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc
+; GFX950-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc
+; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0
; GFX950-NEXT: .LBB215_4: ; %atomicrmw.end
-; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=a"()
@@ -16475,33 +16598,37 @@ define void @flat_atomic_udec_wrap_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_dec_x2 v[2:3], v[2:3], v[0:1] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: s_cbranch_execz .LBB217_3
; GFX90A-NEXT: s_branch .LBB217_4
; GFX90A-NEXT: .LBB217_2:
-; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: .LBB217_3: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_waitcnt vmcnt(1)
; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, -1, v2
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, -1, v3, vcc
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
; GFX90A-NEXT: v_cmp_gt_u64_e64 s[4:5], v[2:3], v[0:1]
; GFX90A-NEXT: s_or_b64 vcc, vcc, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
-; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
; GFX90A-NEXT: .LBB217_4: ; %atomicrmw.end
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_udec_wrap_i64_saddr_ret_a_a:
@@ -16515,37 +16642,40 @@ define void @flat_atomic_udec_wrap_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: ; def a[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0
-; GFX950-NEXT: v_accvgpr_read_b32 v3, a1
+; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3]
-; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
; GFX950-NEXT: s_cbranch_vccz .LBB217_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] sc0
+; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-NEXT: flat_atomic_dec_x2 v[2:3], v[2:3], v[0:1] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: s_cbranch_execz .LBB217_3
; GFX950-NEXT: s_branch .LBB217_4
; GFX950-NEXT: .LBB217_2:
-; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: .LBB217_3: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s2, s0, -1
-; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s2
+; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s2
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GFX950-NEXT: v_cmp_gt_u64_e64 s[0:1], v[0:1], v[2:3]
-; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, -1
+; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GFX950-NEXT: v_cmp_gt_u64_e64 s[0:1], v[2:3], v[0:1]
+; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[2:3], 0, -1
; GFX950-NEXT: s_or_b64 vcc, vcc, s[0:1]
-; GFX950-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX950-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
-; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s2
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
+; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s2
; GFX950-NEXT: .LBB217_4: ; %atomicrmw.end
-; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
%data = call i64 asm "; def $0", "=a"()
@@ -16676,6 +16806,8 @@ define void @flat_atomic_usub_cond_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
@@ -16684,7 +16816,7 @@ define void @flat_atomic_usub_cond_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_branch .LBB219_6
; GFX90A-NEXT: .LBB219_4:
-; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_cbranch_execz .LBB219_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
@@ -16697,13 +16829,13 @@ define void @flat_atomic_usub_cond_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_subb_co_u32_e32 v6, vcc, v1, v5, vcc
; GFX90A-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: v_cndmask_b32_e32 v4, v1, v6, vcc
-; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB219_6: ; %atomicrmw.phi
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -16742,6 +16874,8 @@ define void @flat_atomic_usub_cond_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
@@ -16750,7 +16884,7 @@ define void @flat_atomic_usub_cond_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_branch .LBB219_6
; GFX950-NEXT: .LBB219_4:
-; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_cbranch_execz .LBB219_6
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
@@ -16761,13 +16895,12 @@ define void @flat_atomic_usub_cond_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v5, vcc
; GFX950-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[4:5]
-; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc
; GFX950-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc
; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
; GFX950-NEXT: .LBB219_6: ; %atomicrmw.phi
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -16938,6 +17071,8 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
@@ -16946,7 +17081,7 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_branch .LBB221_6
; GFX90A-NEXT: .LBB221_4:
-; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_cbranch_execz .LBB221_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
@@ -16959,13 +17094,13 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v5, vcc
; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_cndmask_b32_e64 v0, v3, 0, vcc
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
+; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB221_6: ; %atomicrmw.phi
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -17004,6 +17139,8 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
@@ -17012,7 +17149,7 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_branch .LBB221_6
; GFX950-NEXT: .LBB221_4:
-; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_cbranch_execz .LBB221_6
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
@@ -17023,13 +17160,12 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v5, vcc
; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
-; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
; GFX950-NEXT: .LBB221_6: ; %atomicrmw.phi
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -17197,36 +17333,38 @@ define void @flat_atomic_fadd_f32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global
; GFX90A-NEXT: v_mov_b32_e32 v1, 0
; GFX90A-NEXT: global_atomic_add_f32 v1, v1, v0, s[4:5] glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1
; GFX90A-NEXT: s_cbranch_execz .LBB223_5
; GFX90A-NEXT: s_branch .LBB223_6
; GFX90A-NEXT: .LBB223_3:
-; GFX90A-NEXT: ; implicit-def: $vgpr1
+; GFX90A-NEXT: ; implicit-def: $agpr0
; GFX90A-NEXT: s_branch .LBB223_7
; GFX90A-NEXT: .LBB223_4:
-; GFX90A-NEXT: ; implicit-def: $vgpr1
+; GFX90A-NEXT: ; implicit-def: $agpr0
; GFX90A-NEXT: .LBB223_5: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s6, s4, -1
-; GFX90A-NEXT: v_mov_b32_e32 v2, s6
-; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen
+; GFX90A-NEXT: v_mov_b32_e32 v1, s6
+; GFX90A-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_add_f32_e32 v3, v1, v0
-; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen
+; GFX90A-NEXT: v_add_f32_e32 v3, v2, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
; GFX90A-NEXT: .LBB223_6: ; %Flow1
; GFX90A-NEXT: s_cbranch_execnz .LBB223_8
; GFX90A-NEXT: .LBB223_7: ; %atomicrmw.shared
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v1, s4
-; GFX90A-NEXT: ds_add_rtn_f32 v1, v1, v0
+; GFX90A-NEXT: ds_add_rtn_f32 v0, v1, v0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: .LBB223_8: ; %atomicrmw.end
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fadd_f32_saddr_ret_a_a:
@@ -17347,12 +17485,12 @@ define void @flat_atomic_fsub_f32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB225_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -17376,12 +17514,12 @@ define void @flat_atomic_fsub_f32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB225_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -17459,29 +17597,29 @@ define void @flat_atomic_fmax_f32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
-; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
+; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_max_f32_e32 v4, v0, v0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: .LBB227_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX90A-NEXT: v_max_f32_e32 v0, v0, v4
-; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
+; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v1, v0
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB227_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -17491,29 +17629,29 @@ define void @flat_atomic_fmax_f32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40
+; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
; GFX950-NEXT: v_max_f32_e32 v4, v0, v0
-; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: .LBB227_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX950-NEXT: v_max_f32_e32 v0, v0, v4
-; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
+; GFX950-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX950-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX950-NEXT: v_mov_b32_e32 v1, v0
+; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB227_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -17597,29 +17735,29 @@ define void @flat_atomic_fmin_f32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
-; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
+; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_max_f32_e32 v4, v0, v0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: .LBB229_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX90A-NEXT: v_min_f32_e32 v0, v0, v4
-; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
+; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v1, v0
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB229_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -17629,29 +17767,29 @@ define void @flat_atomic_fmin_f32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40
+; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
; GFX950-NEXT: v_max_f32_e32 v4, v0, v0
-; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: .LBB229_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX950-NEXT: v_min_f32_e32 v0, v0, v4
-; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
+; GFX950-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX950-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX950-NEXT: v_mov_b32_e32 v1, v0
+; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB229_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -17735,30 +17873,29 @@ define void @flat_atomic_fmaximum_f32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
-; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
+; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: .LBB231_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_max_f32_e32 v0, v1, v4
-; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v1, v4
-; GFX90A-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
-; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
+; GFX90A-NEXT: v_max_f32_e32 v2, v3, v4
+; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v3, v4
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v1, v0
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB231_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -17782,12 +17919,12 @@ define void @flat_atomic_fmaximum_f32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB231_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -17868,30 +18005,29 @@ define void @flat_atomic_fminimum_f32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
-; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
+; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: .LBB233_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_min_f32_e32 v0, v1, v4
-; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v1, v4
-; GFX90A-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
-; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
+; GFX90A-NEXT: v_min_f32_e32 v2, v3, v4
+; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v3, v4
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v1, v0
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB233_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -17915,12 +18051,12 @@ define void @flat_atomic_fminimum_f32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB233_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -18025,21 +18161,27 @@ define void @flat_atomic_fadd_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: global_atomic_add_f64 v[2:3], v2, v[0:1], s[4:5] glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: s_cbranch_execz .LBB235_5
; GFX90A-NEXT: s_branch .LBB235_6
; GFX90A-NEXT: .LBB235_3:
-; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_branch .LBB235_7
; GFX90A-NEXT: .LBB235_4:
-; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: .LBB235_5: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s6, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v6, s6
; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_add_f64 v[4:5], v[2:3], v[0:1]
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB235_6: ; %Flow1
@@ -18047,17 +18189,16 @@ define void @flat_atomic_fadd_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: .LBB235_7: ; %atomicrmw.shared
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v2, s4
-; GFX90A-NEXT: ds_add_rtn_f64 v[2:3], v2, v[0:1]
+; GFX90A-NEXT: ds_add_rtn_f64 v[0:1], v2, v[0:1]
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: .LBB235_8: ; %atomicrmw.end
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fadd_f64_saddr_ret_a_a:
@@ -18084,36 +18225,40 @@ define void @flat_atomic_fadd_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.global
; GFX950-NEXT: v_mov_b32_e32 v2, 0
; GFX950-NEXT: global_atomic_add_f64 v[2:3], v2, v[0:1], s[0:1] sc0
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: s_cbranch_execz .LBB235_5
; GFX950-NEXT: s_branch .LBB235_6
; GFX950-NEXT: .LBB235_3:
-; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_branch .LBB235_7
; GFX950-NEXT: .LBB235_4:
-; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: .LBB235_5: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s2, s0, -1
; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s2
; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_add_f64 v[4:5], v[2:3], v[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: scratch_store_dwordx2 off, v[4:5], s2
; GFX950-NEXT: .LBB235_6: ; %Flow1
; GFX950-NEXT: s_cbranch_execnz .LBB235_8
; GFX950-NEXT: .LBB235_7: ; %atomicrmw.shared
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
-; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v2, s0
-; GFX950-NEXT: ds_add_rtn_f64 v[2:3], v2, v[0:1]
+; GFX950-NEXT: ds_add_rtn_f64 v[0:1], v2, v[0:1]
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: .LBB235_8: ; %atomicrmw.end
-; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10
%data = call double asm "; def $0", "=a"()
@@ -18263,7 +18408,9 @@ define void @flat_atomic_fsub_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB237_2
@@ -18271,7 +18418,7 @@ define void @flat_atomic_fsub_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_branch .LBB237_6
; GFX90A-NEXT: .LBB237_4:
-; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_cbranch_execz .LBB237_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
@@ -18279,13 +18426,14 @@ define void @flat_atomic_fsub_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: v_mov_b32_e32 v6, s4
; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], -v[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB237_6: ; %atomicrmw.phi
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -18318,7 +18466,9 @@ define void @flat_atomic_fsub_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB237_2
@@ -18326,18 +18476,18 @@ define void @flat_atomic_fsub_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_branch .LBB237_6
; GFX950-NEXT: .LBB237_4:
-; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_cbranch_execz .LBB237_6
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0
; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_add_f64 v[2:3], v[0:1], -v[4:5]
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
; GFX950-NEXT: .LBB237_6: ; %atomicrmw.phi
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -18468,37 +18618,41 @@ define void @flat_atomic_fmax_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
-; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1
+; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
-; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
; GFX90A-NEXT: s_cbranch_vccz .LBB239_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
-; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] glc
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT: flat_atomic_max_f64 v[2:3], v[2:3], v[0:1] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: s_cbranch_execz .LBB239_3
; GFX90A-NEXT: s_branch .LBB239_4
; GFX90A-NEXT: .LBB239_2:
-; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: .LBB239_3: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v6, s4
-; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX90A-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3]
-; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX90A-NEXT: v_max_f64 v[0:1], v[4:5], v[0:1]
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
+; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB239_4: ; %atomicrmw.end
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmax_f64_saddr_ret_a_a:
@@ -18520,10 +18674,13 @@ define void @flat_atomic_fmax_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-NEXT: flat_atomic_max_f64 v[2:3], v[2:3], v[0:1] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: s_cbranch_execz .LBB239_3
; GFX950-NEXT: s_branch .LBB239_4
; GFX950-NEXT: .LBB239_2:
-; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: .LBB239_3: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
@@ -18531,15 +18688,15 @@ define void @flat_atomic_fmax_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_max_f64 v[0:1], v[4:5], v[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0
; GFX950-NEXT: .LBB239_4: ; %atomicrmw.end
-; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10
%data = call double asm "; def $0", "=a"()
@@ -18644,37 +18801,41 @@ define void @flat_atomic_fmin_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
-; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1
+; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7]
-; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
; GFX90A-NEXT: s_cbranch_vccz .LBB241_2
; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
-; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] glc
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT: flat_atomic_min_f64 v[2:3], v[2:3], v[0:1] glc
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: s_cbranch_execz .LBB241_3
; GFX90A-NEXT: s_branch .LBB241_4
; GFX90A-NEXT: .LBB241_2:
-; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: .LBB241_3: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX90A-NEXT: s_cselect_b32 s4, s4, -1
; GFX90A-NEXT: v_mov_b32_e32 v6, s4
-; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
-; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX90A-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX90A-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3]
-; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX90A-NEXT: v_min_f64 v[0:1], v[4:5], v[0:1]
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
+; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB241_4: ; %atomicrmw.end
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fmin_f64_saddr_ret_a_a:
@@ -18696,10 +18857,13 @@ define void @flat_atomic_fmin_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX950-NEXT: flat_atomic_min_f64 v[2:3], v[2:3], v[0:1] sc0
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: s_cbranch_execz .LBB241_3
; GFX950-NEXT: s_branch .LBB241_4
; GFX950-NEXT: .LBB241_2:
-; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: .LBB241_3: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX950-NEXT: s_cselect_b32 s0, s0, -1
@@ -18707,15 +18871,15 @@ define void @flat_atomic_fmin_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_min_f64 v[0:1], v[4:5], v[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0
; GFX950-NEXT: .LBB241_4: ; %atomicrmw.end
-; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10
%data = call double asm "; def $0", "=a"()
@@ -18839,6 +19003,8 @@ define void @flat_atomic_fmaximum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
@@ -18847,7 +19013,7 @@ define void @flat_atomic_fmaximum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_branch .LBB243_6
; GFX90A-NEXT: .LBB243_4:
-; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_cbranch_execz .LBB243_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
@@ -18856,16 +19022,17 @@ define void @flat_atomic_fmaximum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000
+; GFX90A-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5]
; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc
; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB243_6: ; %atomicrmw.phi
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -18903,6 +19070,8 @@ define void @flat_atomic_fmaximum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
@@ -18911,7 +19080,7 @@ define void @flat_atomic_fmaximum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_branch .LBB243_6
; GFX950-NEXT: .LBB243_4:
-; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_cbranch_execz .LBB243_6
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
@@ -18921,13 +19090,12 @@ define void @flat_atomic_fmaximum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5]
; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
-; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
; GFX950-NEXT: .LBB243_6: ; %atomicrmw.phi
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -19095,6 +19263,8 @@ define void @flat_atomic_fminimum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
@@ -19103,7 +19273,7 @@ define void @flat_atomic_fminimum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_branch .LBB245_6
; GFX90A-NEXT: .LBB245_4:
-; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1
; GFX90A-NEXT: s_cbranch_execz .LBB245_6
; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0
@@ -19112,16 +19282,17 @@ define void @flat_atomic_fminimum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000
+; GFX90A-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5]
; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc
; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4
; GFX90A-NEXT: .LBB245_6: ; %atomicrmw.phi
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -19159,6 +19330,8 @@ define void @flat_atomic_fminimum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
@@ -19167,7 +19340,7 @@ define void @flat_atomic_fminimum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_branch .LBB245_6
; GFX950-NEXT: .LBB245_4:
-; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_cbranch_execz .LBB245_6
; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private
; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0
@@ -19177,13 +19350,12 @@ define void @flat_atomic_fminimum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5]
; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
-; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0
; GFX950-NEXT: .LBB245_6: ; %atomicrmw.phi
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -19343,12 +19515,12 @@ define void @flat_atomic_fadd_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB247_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -19446,12 +19618,12 @@ define void @flat_atomic_fsub_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB249_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -19475,12 +19647,12 @@ define void @flat_atomic_fsub_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB249_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -19558,29 +19730,29 @@ define void @flat_atomic_fmax_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
-; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
+; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_pk_max_f16 v4, v0, v0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: .LBB251_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1
-; GFX90A-NEXT: v_pk_max_f16 v0, v0, v4
-; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
+; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v1, v0
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB251_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -19590,30 +19762,30 @@ define void @flat_atomic_fmax_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40
+; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
; GFX950-NEXT: v_pk_max_f16 v4, v0, v0
-; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: .LBB251_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_pk_max_f16 v0, v1, v1
+; GFX950-NEXT: v_pk_max_f16 v2, v3, v3
; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_pk_max_f16 v0, v0, v4
-; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
+; GFX950-NEXT: v_pk_max_f16 v2, v2, v4
+; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX950-NEXT: v_mov_b32_e32 v1, v0
+; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB251_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -19698,29 +19870,29 @@ define void @flat_atomic_fmin_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
-; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40
+; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_pk_max_f16 v4, v0, v0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
; GFX90A-NEXT: .LBB253_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1
-; GFX90A-NEXT: v_pk_min_f16 v0, v0, v4
-; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
+; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v1, v0
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB253_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -19730,30 +19902,30 @@ define void @flat_atomic_fmin_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40
+; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
; GFX950-NEXT: v_pk_max_f16 v4, v0, v0
-; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-NEXT: .LBB253_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_pk_max_f16 v0, v1, v1
+; GFX950-NEXT: v_pk_max_f16 v2, v3, v3
; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_pk_min_f16 v0, v0, v4
-; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
+; GFX950-NEXT: v_pk_min_f16 v2, v2, v4
+; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX950-NEXT: v_mov_b32_e32 v1, v0
+; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB253_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -19859,13 +20031,13 @@ define void @flat_atomic_fmaximum_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB255_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -19889,12 +20061,12 @@ define void @flat_atomic_fmaximum_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB255_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -20000,13 +20172,13 @@ define void @flat_atomic_fminimum_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB257_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -20030,12 +20202,12 @@ define void @flat_atomic_fminimum_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB257_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -20156,13 +20328,13 @@ define void @flat_atomic_fadd_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB259_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -20297,13 +20469,13 @@ define void @flat_atomic_fsub_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB261_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -20333,13 +20505,13 @@ define void @flat_atomic_fsub_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB261_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -20475,13 +20647,13 @@ define void @flat_atomic_fmax_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB263_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -20511,13 +20683,13 @@ define void @flat_atomic_fmax_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB263_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -20653,13 +20825,13 @@ define void @flat_atomic_fmin_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB265_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -20689,13 +20861,13 @@ define void @flat_atomic_fmin_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB265_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -20836,13 +21008,13 @@ define void @flat_atomic_fmaximum_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB267_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -20872,13 +21044,13 @@ define void @flat_atomic_fmaximum_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB267_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -21024,13 +21196,13 @@ define void @flat_atomic_fminimum_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB269_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -21060,13 +21232,13 @@ define void @flat_atomic_fminimum_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB269_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/a-v-global-atomic-cmpxchg.ll b/llvm/test/CodeGen/AMDGPU/a-v-global-atomic-cmpxchg.ll
index 37a44d8b4b7d1..063feec759efa 100644
--- a/llvm/test/CodeGen/AMDGPU/a-v-global-atomic-cmpxchg.ll
+++ b/llvm/test/CodeGen/AMDGPU/a-v-global-atomic-cmpxchg.ll
@@ -449,13 +449,13 @@ define void @global_atomic_cmpxchg_i64_ret_a_a__a(ptr addrspace(1) %ptr) #0 {
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def a[0:1]
; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v5, a1
+; CHECK-NEXT: v_accvgpr_read_b32 v4, a0
; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; def a[2:3]
+; CHECK-NEXT: ; def a[0:1]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_accvgpr_read_b32 v2, a2
-; CHECK-NEXT: v_accvgpr_read_b32 v3, a3
-; CHECK-NEXT: v_accvgpr_read_b32 v4, a0
-; CHECK-NEXT: v_accvgpr_read_b32 v5, a1
+; CHECK-NEXT: v_accvgpr_read_b32 v3, a1
+; CHECK-NEXT: v_accvgpr_read_b32 v2, a0
; CHECK-NEXT: buffer_wbl2
; CHECK-NEXT: global_atomic_cmpswap_x2 v[0:1], v[0:1], v[2:5], off offset:80 glc
; CHECK-NEXT: s_waitcnt vmcnt(0)
@@ -483,13 +483,13 @@ define void @global_atomic_cmpxchg_i64_ret_a_a__v(ptr addrspace(1) %ptr) #0 {
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def a[0:1]
; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v5, a1
+; CHECK-NEXT: v_accvgpr_read_b32 v4, a0
; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; def a[2:3]
+; CHECK-NEXT: ; def a[0:1]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_accvgpr_read_b32 v2, a2
-; CHECK-NEXT: v_accvgpr_read_b32 v3, a3
-; CHECK-NEXT: v_accvgpr_read_b32 v4, a0
-; CHECK-NEXT: v_accvgpr_read_b32 v5, a1
+; CHECK-NEXT: v_accvgpr_read_b32 v3, a1
+; CHECK-NEXT: v_accvgpr_read_b32 v2, a0
; CHECK-NEXT: buffer_wbl2
; CHECK-NEXT: global_atomic_cmpswap_x2 v[0:1], v[0:1], v[2:5], off offset:80 glc
; CHECK-NEXT: s_waitcnt vmcnt(0)
@@ -515,8 +515,8 @@ define void @global_atomic_cmpxchg_i64_ret_v_a__v(ptr addrspace(1) %ptr) #0 {
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def a[0:1]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_accvgpr_read_b32 v2, a0
; CHECK-NEXT: v_accvgpr_read_b32 v3, a1
+; CHECK-NEXT: v_accvgpr_read_b32 v2, a0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def v[4:5]
; CHECK-NEXT: ;;#ASMEND
@@ -545,8 +545,8 @@ define void @global_atomic_cmpxchg_i64_ret_a_v__v(ptr addrspace(1) %ptr) #0 {
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def a[0:1]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_accvgpr_read_b32 v4, a0
; CHECK-NEXT: v_accvgpr_read_b32 v5, a1
+; CHECK-NEXT: v_accvgpr_read_b32 v4, a0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def v[2:3]
; CHECK-NEXT: ;;#ASMEND
@@ -661,8 +661,8 @@ define void @global_atomic_cmpxchg_i64_ret_av_a__av(ptr addrspace(1) %ptr) #0 {
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def a[0:1]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_accvgpr_read_b32 v2, a0
; CHECK-NEXT: v_accvgpr_read_b32 v3, a1
+; CHECK-NEXT: v_accvgpr_read_b32 v2, a0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def v[4:5]
; CHECK-NEXT: ;;#ASMEND
@@ -691,8 +691,8 @@ define void @global_atomic_cmpxchg_i64_ret_a_av__av(ptr addrspace(1) %ptr) #0 {
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def a[0:1]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_accvgpr_read_b32 v4, a0
; CHECK-NEXT: v_accvgpr_read_b32 v5, a1
+; CHECK-NEXT: v_accvgpr_read_b32 v4, a0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def v[2:3]
; CHECK-NEXT: ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll
index c54421ae64528..c98fff96d7b8a 100644
--- a/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll
+++ b/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll
@@ -338,264 +338,225 @@ define void @global_atomic_xchg_i32_ret_av_av_no_agprs(ptr addrspace(1) %ptr) #0
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword a32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword a33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword a34, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX90A-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a6, v43 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a7, v44 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a8, v45 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a9, v46 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a10, v47 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a11, v56 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a12, v57 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a13, v58 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a14, v59 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a15, v60 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:31]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_write_b32 a33, v31
-; GFX90A-NEXT: v_accvgpr_write_b32 a32, v30
-; GFX90A-NEXT: v_accvgpr_write_b32 a31, v29
-; GFX90A-NEXT: v_accvgpr_write_b32 a30, v28
-; GFX90A-NEXT: v_accvgpr_write_b32 a29, v27
-; GFX90A-NEXT: v_accvgpr_write_b32 a28, v26
-; GFX90A-NEXT: v_accvgpr_write_b32 a27, v25
-; GFX90A-NEXT: v_accvgpr_write_b32 a26, v24
-; GFX90A-NEXT: v_accvgpr_write_b32 a25, v23
-; GFX90A-NEXT: v_accvgpr_write_b32 a24, v22
-; GFX90A-NEXT: v_accvgpr_write_b32 a23, v21
-; GFX90A-NEXT: v_accvgpr_write_b32 a22, v20
-; GFX90A-NEXT: v_accvgpr_write_b32 a21, v19
-; GFX90A-NEXT: v_accvgpr_write_b32 a20, v18
-; GFX90A-NEXT: v_accvgpr_write_b32 a19, v17
-; GFX90A-NEXT: v_accvgpr_write_b32 a18, v16
-; GFX90A-NEXT: v_accvgpr_write_b32 a17, v15
-; GFX90A-NEXT: v_accvgpr_write_b32 a16, v14
-; GFX90A-NEXT: v_accvgpr_write_b32 a15, v13
-; GFX90A-NEXT: v_accvgpr_write_b32 a14, v12
-; GFX90A-NEXT: v_accvgpr_write_b32 a13, v11
-; GFX90A-NEXT: v_accvgpr_write_b32 a12, v10
-; GFX90A-NEXT: v_accvgpr_write_b32 a11, v9
-; GFX90A-NEXT: v_accvgpr_write_b32 a10, v8
-; GFX90A-NEXT: v_accvgpr_write_b32 a9, v7
-; GFX90A-NEXT: v_accvgpr_write_b32 a8, v6
-; GFX90A-NEXT: v_accvgpr_write_b32 a7, v5
-; GFX90A-NEXT: v_accvgpr_write_b32 a6, v4
-; GFX90A-NEXT: v_accvgpr_write_b32 a5, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a4, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a3, v1
-; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def a2
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_accvgpr_write_b32 a19, v31 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def a34
-; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
-; GFX90A-NEXT: v_accvgpr_read_b32 v2, a34
+; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: global_atomic_swap v0, v[0:1], v2, off offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
+; GFX90A-NEXT: v_accvgpr_write_b32 a31, v19 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a30, v20 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a29, v21 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a28, v22 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a27, v23 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a26, v24 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a25, v25 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a24, v26 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a23, v27 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_read_b32 v0, a2
-; GFX90A-NEXT: v_accvgpr_read_b32 v1, a3
-; GFX90A-NEXT: v_accvgpr_read_b32 v2, a4
-; GFX90A-NEXT: v_accvgpr_read_b32 v3, a5
-; GFX90A-NEXT: v_accvgpr_read_b32 v4, a6
-; GFX90A-NEXT: v_accvgpr_read_b32 v5, a7
-; GFX90A-NEXT: v_accvgpr_read_b32 v6, a8
-; GFX90A-NEXT: v_accvgpr_read_b32 v7, a9
-; GFX90A-NEXT: v_accvgpr_read_b32 v8, a10
-; GFX90A-NEXT: v_accvgpr_read_b32 v9, a11
-; GFX90A-NEXT: v_accvgpr_read_b32 v10, a12
-; GFX90A-NEXT: v_accvgpr_read_b32 v11, a13
-; GFX90A-NEXT: v_accvgpr_read_b32 v12, a14
-; GFX90A-NEXT: v_accvgpr_read_b32 v13, a15
-; GFX90A-NEXT: v_accvgpr_read_b32 v14, a16
-; GFX90A-NEXT: v_accvgpr_read_b32 v15, a17
-; GFX90A-NEXT: v_accvgpr_read_b32 v16, a18
-; GFX90A-NEXT: v_accvgpr_read_b32 v17, a19
-; GFX90A-NEXT: v_accvgpr_read_b32 v18, a20
-; GFX90A-NEXT: v_accvgpr_read_b32 v19, a21
-; GFX90A-NEXT: v_accvgpr_read_b32 v20, a22
-; GFX90A-NEXT: v_accvgpr_read_b32 v21, a23
-; GFX90A-NEXT: v_accvgpr_read_b32 v22, a24
-; GFX90A-NEXT: v_accvgpr_read_b32 v23, a25
-; GFX90A-NEXT: v_accvgpr_read_b32 v24, a26
-; GFX90A-NEXT: v_accvgpr_read_b32 v25, a27
-; GFX90A-NEXT: v_accvgpr_read_b32 v26, a28
-; GFX90A-NEXT: v_accvgpr_read_b32 v27, a29
-; GFX90A-NEXT: v_accvgpr_read_b32 v28, a30
-; GFX90A-NEXT: v_accvgpr_read_b32 v29, a31
-; GFX90A-NEXT: v_accvgpr_read_b32 v30, a32
-; GFX90A-NEXT: v_accvgpr_read_b32 v31, a33
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v[0:31]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: buffer_load_dword a34, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword a33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword a32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX90A-NEXT: v_accvgpr_read_b32 v19, a31 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v20, a30 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v21, a29 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v22, a28 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v23, a27 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v24, a26 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v25, a25 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v26, a24 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v27, a23 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v28, a22 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v29, a21 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v30, a20 ; Reload Reuse
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_accvgpr_read_b32 v31, a19 ; Reload Reuse
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use v[0:31]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_accvgpr_read_b32 v63, a18 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v62, a17 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v61, a16 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v60, a15 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v59, a14 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v58, a13 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v57, a12 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v56, a11 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v47, a10 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v46, a9 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v45, a8 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v44, a7 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v43, a6 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v42, a5 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v41, a4 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v40, a3 ; Reload Reuse
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: global_atomic_xchg_i32_ret_av_av_no_agprs:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: scratch_store_dword off, v40, s32 offset:72 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v41, s32 offset:68 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v42, s32 offset:64 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v43, s32 offset:60 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v44, s32 offset:56 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v45, s32 offset:52 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v46, s32 offset:48 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v47, s32 offset:44 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v56, s32 offset:40 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v57, s32 offset:36 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v58, s32 offset:32 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v59, s32 offset:28 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v60, s32 offset:24 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v61, s32 offset:20 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v62, s32 offset:16 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v63, s32 offset:12 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, a32, s32 offset:8 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, a33, s32 offset:4 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, a34, s32 ; 4-byte Folded Spill
+; GFX950-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a6, v43 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a7, v44 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a8, v45 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a9, v46 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a10, v47 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a11, v56 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a12, v57 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a13, v58 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a14, v59 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a15, v60 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[0:31]
; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: scratch_store_dwordx4 off, v[0:3], s32 ; 16-byte Folded Spill
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: scratch_store_dwordx4 off, v[4:7], s32 offset:16 ; 16-byte Folded Spill
+; GFX950-NEXT: scratch_store_dwordx4 off, v[8:11], s32 offset:32 ; 16-byte Folded Spill
+; GFX950-NEXT: scratch_store_dwordx4 off, v[12:15], s32 offset:48 ; 16-byte Folded Spill
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def a34
+; GFX950-NEXT: ; def a2
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_accvgpr_write_b32 a33, v31
-; GFX950-NEXT: v_accvgpr_write_b32 a32, v30
-; GFX950-NEXT: v_accvgpr_write_b32 a31, v29
-; GFX950-NEXT: v_accvgpr_write_b32 a30, v28
-; GFX950-NEXT: v_accvgpr_write_b32 a29, v27
-; GFX950-NEXT: v_accvgpr_write_b32 a28, v26
-; GFX950-NEXT: v_accvgpr_write_b32 a27, v25
-; GFX950-NEXT: v_accvgpr_write_b32 a26, v24
-; GFX950-NEXT: v_accvgpr_write_b32 a25, v23
-; GFX950-NEXT: v_accvgpr_write_b32 a24, v22
-; GFX950-NEXT: v_accvgpr_write_b32 a23, v21
-; GFX950-NEXT: v_accvgpr_write_b32 a22, v20
-; GFX950-NEXT: v_accvgpr_write_b32 a21, v19
-; GFX950-NEXT: v_accvgpr_write_b32 a20, v18
-; GFX950-NEXT: v_accvgpr_write_b32 a19, v17
-; GFX950-NEXT: v_accvgpr_write_b32 a18, v16
-; GFX950-NEXT: v_accvgpr_write_b32 a17, v15
-; GFX950-NEXT: v_accvgpr_write_b32 a16, v14
-; GFX950-NEXT: v_accvgpr_write_b32 a15, v13
-; GFX950-NEXT: v_accvgpr_write_b32 a14, v12
-; GFX950-NEXT: v_accvgpr_write_b32 a13, v11
-; GFX950-NEXT: v_accvgpr_write_b32 a12, v10
-; GFX950-NEXT: v_accvgpr_write_b32 a11, v9
-; GFX950-NEXT: v_accvgpr_write_b32 a10, v8
-; GFX950-NEXT: v_accvgpr_write_b32 a9, v7
-; GFX950-NEXT: v_accvgpr_write_b32 a8, v6
-; GFX950-NEXT: v_accvgpr_write_b32 a7, v5
-; GFX950-NEXT: v_accvgpr_write_b32 a6, v4
-; GFX950-NEXT: v_accvgpr_write_b32 a5, v3
-; GFX950-NEXT: v_accvgpr_write_b32 a4, v2
-; GFX950-NEXT: v_accvgpr_write_b32 a3, v1
-; GFX950-NEXT: v_accvgpr_write_b32 a2, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a19, v31 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX950-NEXT: scratch_store_dwordx3 off, v[16:18], s32 offset:64 ; 12-byte Folded Spill
; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
-; GFX950-NEXT: v_accvgpr_read_b32 v2, a34
+; GFX950-NEXT: v_accvgpr_read_b32 v2, a2
; GFX950-NEXT: buffer_wbl2 sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: global_atomic_swap v0, v[0:1], v2, off offset:40 sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
+; GFX950-NEXT: v_accvgpr_write_b32 a31, v19 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a27, v23 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a28, v22 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a29, v21 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a30, v20 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a23, v27 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a24, v26 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a25, v25 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a26, v24 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_read_b32 v0, a2
-; GFX950-NEXT: v_accvgpr_read_b32 v1, a3
-; GFX950-NEXT: v_accvgpr_read_b32 v2, a4
-; GFX950-NEXT: v_accvgpr_read_b32 v3, a5
-; GFX950-NEXT: v_accvgpr_read_b32 v4, a6
-; GFX950-NEXT: v_accvgpr_read_b32 v5, a7
-; GFX950-NEXT: v_accvgpr_read_b32 v6, a8
-; GFX950-NEXT: v_accvgpr_read_b32 v7, a9
-; GFX950-NEXT: v_accvgpr_read_b32 v8, a10
-; GFX950-NEXT: v_accvgpr_read_b32 v9, a11
-; GFX950-NEXT: v_accvgpr_read_b32 v10, a12
-; GFX950-NEXT: v_accvgpr_read_b32 v11, a13
-; GFX950-NEXT: v_accvgpr_read_b32 v12, a14
-; GFX950-NEXT: v_accvgpr_read_b32 v13, a15
-; GFX950-NEXT: v_accvgpr_read_b32 v14, a16
-; GFX950-NEXT: v_accvgpr_read_b32 v15, a17
-; GFX950-NEXT: v_accvgpr_read_b32 v16, a18
-; GFX950-NEXT: v_accvgpr_read_b32 v17, a19
-; GFX950-NEXT: v_accvgpr_read_b32 v18, a20
-; GFX950-NEXT: v_accvgpr_read_b32 v19, a21
-; GFX950-NEXT: v_accvgpr_read_b32 v20, a22
-; GFX950-NEXT: v_accvgpr_read_b32 v21, a23
-; GFX950-NEXT: v_accvgpr_read_b32 v22, a24
-; GFX950-NEXT: v_accvgpr_read_b32 v23, a25
-; GFX950-NEXT: v_accvgpr_read_b32 v24, a26
-; GFX950-NEXT: v_accvgpr_read_b32 v25, a27
-; GFX950-NEXT: v_accvgpr_read_b32 v26, a28
-; GFX950-NEXT: v_accvgpr_read_b32 v27, a29
-; GFX950-NEXT: v_accvgpr_read_b32 v28, a30
-; GFX950-NEXT: v_accvgpr_read_b32 v29, a31
-; GFX950-NEXT: v_accvgpr_read_b32 v30, a32
-; GFX950-NEXT: v_accvgpr_read_b32 v31, a33
-; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v[0:31]
-; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: scratch_load_dword a34, off, s32 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword a33, off, s32 offset:4 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword a32, off, s32 offset:8 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v63, off, s32 offset:12 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v62, off, s32 offset:16 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v61, off, s32 offset:20 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v60, off, s32 offset:24 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v59, off, s32 offset:28 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v58, off, s32 offset:32 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v57, off, s32 offset:36 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v56, off, s32 offset:40 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v47, off, s32 offset:44 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v46, off, s32 offset:48 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v45, off, s32 offset:52 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v44, off, s32 offset:56 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v43, off, s32 offset:60 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v42, off, s32 offset:64 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v41, off, s32 offset:68 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v40, off, s32 offset:72 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dwordx4 v[0:3], off, s32 ; 16-byte Folded Reload
+; GFX950-NEXT: scratch_load_dwordx4 v[4:7], off, s32 offset:16 ; 16-byte Folded Reload
+; GFX950-NEXT: scratch_load_dwordx4 v[8:11], off, s32 offset:32 ; 16-byte Folded Reload
+; GFX950-NEXT: scratch_load_dwordx4 v[12:15], off, s32 offset:48 ; 16-byte Folded Reload
+; GFX950-NEXT: v_accvgpr_read_b32 v19, a31 ; Reload Reuse
+; GFX950-NEXT: scratch_load_dwordx3 v[16:18], off, s32 offset:64 ; 12-byte Folded Reload
+; GFX950-NEXT: v_accvgpr_read_b32 v23, a27 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v22, a28 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v21, a29 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v20, a30 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v27, a23 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v26, a24 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v25, a25 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v24, a26 ; Reload Reuse
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_accvgpr_read_b32 v31, a19 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v30, a20 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v29, a21 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v28, a22 ; Reload Reuse
+; GFX950-NEXT: ;;#ASMSTART
+; GFX950-NEXT: ; use v[0:31]
+; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: v_accvgpr_read_b32 v63, a18 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v62, a17 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v61, a16 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v60, a15 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v59, a14 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v58, a13 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v57, a12 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v56, a11 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v47, a10 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v46, a9 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v45, a8 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v44, a7 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v43, a6 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v42, a5 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v41, a4 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v40, a3 ; Reload Reuse
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10
%data = call i32 asm "; def $0", "=^VA"()
@@ -1101,12 +1062,12 @@ define void @global_atomic_xor_expansion_i32_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB21_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -1131,12 +1092,12 @@ define void @global_atomic_xor_expansion_i32_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB21_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -1236,12 +1197,12 @@ define void @global_atomic_xor_expansion_i32_ret_v_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB23_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -1265,12 +1226,12 @@ define void @global_atomic_xor_expansion_i32_ret_v_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB23_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -1433,12 +1394,12 @@ define void @global_atomic_xor_expansion_i32_ret_av_a(ptr addrspace(1) %ptr) #0
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB26_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -1462,12 +1423,12 @@ define void @global_atomic_xor_expansion_i32_ret_av_a(ptr addrspace(1) %ptr) #0
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB26_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -2046,14 +2007,14 @@ define void @global_atomic_xor_expansion_i64_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB32_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -2079,14 +2040,14 @@ define void @global_atomic_xor_expansion_i64_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB32_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -2190,14 +2151,14 @@ define void @global_atomic_xor_expansion_i64_ret_v_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB34_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -2221,14 +2182,14 @@ define void @global_atomic_xor_expansion_i64_ret_v_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB34_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -2395,14 +2356,14 @@ define void @global_atomic_xor_expansion_i64_ret_av_a(ptr addrspace(1) %ptr) #0
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB37_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -2426,14 +2387,14 @@ define void @global_atomic_xor_expansion_i64_ret_av_a(ptr addrspace(1) %ptr) #0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB37_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -3025,262 +2986,223 @@ define void @global_atomic_xor_i32_ret_av_av_no_agprs(ptr addrspace(1) %ptr) #0
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword a32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword a33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX90A-NEXT: buffer_store_dword a34, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX90A-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a6, v43 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a7, v44 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a8, v45 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a9, v46 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a10, v47 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a11, v56 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a12, v57 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a13, v58 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a14, v59 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a15, v60 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:31]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_write_b32 a33, v31
-; GFX90A-NEXT: v_accvgpr_write_b32 a32, v30
-; GFX90A-NEXT: v_accvgpr_write_b32 a31, v29
-; GFX90A-NEXT: v_accvgpr_write_b32 a30, v28
-; GFX90A-NEXT: v_accvgpr_write_b32 a29, v27
-; GFX90A-NEXT: v_accvgpr_write_b32 a28, v26
-; GFX90A-NEXT: v_accvgpr_write_b32 a27, v25
-; GFX90A-NEXT: v_accvgpr_write_b32 a26, v24
-; GFX90A-NEXT: v_accvgpr_write_b32 a25, v23
-; GFX90A-NEXT: v_accvgpr_write_b32 a24, v22
-; GFX90A-NEXT: v_accvgpr_write_b32 a23, v21
-; GFX90A-NEXT: v_accvgpr_write_b32 a22, v20
-; GFX90A-NEXT: v_accvgpr_write_b32 a21, v19
-; GFX90A-NEXT: v_accvgpr_write_b32 a20, v18
-; GFX90A-NEXT: v_accvgpr_write_b32 a19, v17
-; GFX90A-NEXT: v_accvgpr_write_b32 a18, v16
-; GFX90A-NEXT: v_accvgpr_write_b32 a17, v15
-; GFX90A-NEXT: v_accvgpr_write_b32 a16, v14
-; GFX90A-NEXT: v_accvgpr_write_b32 a15, v13
-; GFX90A-NEXT: v_accvgpr_write_b32 a14, v12
-; GFX90A-NEXT: v_accvgpr_write_b32 a13, v11
-; GFX90A-NEXT: v_accvgpr_write_b32 a12, v10
-; GFX90A-NEXT: v_accvgpr_write_b32 a11, v9
-; GFX90A-NEXT: v_accvgpr_write_b32 a10, v8
-; GFX90A-NEXT: v_accvgpr_write_b32 a9, v7
-; GFX90A-NEXT: v_accvgpr_write_b32 a8, v6
-; GFX90A-NEXT: v_accvgpr_write_b32 a7, v5
-; GFX90A-NEXT: v_accvgpr_write_b32 a6, v4
-; GFX90A-NEXT: v_accvgpr_write_b32 a5, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a4, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a3, v1
-; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0
+; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def a2
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_accvgpr_write_b32 a19, v31 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def a34
-; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
-; GFX90A-NEXT: v_accvgpr_read_b32 v2, a34
+; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: global_atomic_xor v0, v[0:1], v2, off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
+; GFX90A-NEXT: v_accvgpr_write_b32 a31, v19 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a30, v20 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a29, v21 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a28, v22 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a27, v23 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a26, v24 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a25, v25 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a24, v26 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a23, v27 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_read_b32 v0, a2
-; GFX90A-NEXT: v_accvgpr_read_b32 v1, a3
-; GFX90A-NEXT: v_accvgpr_read_b32 v2, a4
-; GFX90A-NEXT: v_accvgpr_read_b32 v3, a5
-; GFX90A-NEXT: v_accvgpr_read_b32 v4, a6
-; GFX90A-NEXT: v_accvgpr_read_b32 v5, a7
-; GFX90A-NEXT: v_accvgpr_read_b32 v6, a8
-; GFX90A-NEXT: v_accvgpr_read_b32 v7, a9
-; GFX90A-NEXT: v_accvgpr_read_b32 v8, a10
-; GFX90A-NEXT: v_accvgpr_read_b32 v9, a11
-; GFX90A-NEXT: v_accvgpr_read_b32 v10, a12
-; GFX90A-NEXT: v_accvgpr_read_b32 v11, a13
-; GFX90A-NEXT: v_accvgpr_read_b32 v12, a14
-; GFX90A-NEXT: v_accvgpr_read_b32 v13, a15
-; GFX90A-NEXT: v_accvgpr_read_b32 v14, a16
-; GFX90A-NEXT: v_accvgpr_read_b32 v15, a17
-; GFX90A-NEXT: v_accvgpr_read_b32 v16, a18
-; GFX90A-NEXT: v_accvgpr_read_b32 v17, a19
-; GFX90A-NEXT: v_accvgpr_read_b32 v18, a20
-; GFX90A-NEXT: v_accvgpr_read_b32 v19, a21
-; GFX90A-NEXT: v_accvgpr_read_b32 v20, a22
-; GFX90A-NEXT: v_accvgpr_read_b32 v21, a23
-; GFX90A-NEXT: v_accvgpr_read_b32 v22, a24
-; GFX90A-NEXT: v_accvgpr_read_b32 v23, a25
-; GFX90A-NEXT: v_accvgpr_read_b32 v24, a26
-; GFX90A-NEXT: v_accvgpr_read_b32 v25, a27
-; GFX90A-NEXT: v_accvgpr_read_b32 v26, a28
-; GFX90A-NEXT: v_accvgpr_read_b32 v27, a29
-; GFX90A-NEXT: v_accvgpr_read_b32 v28, a30
-; GFX90A-NEXT: v_accvgpr_read_b32 v29, a31
-; GFX90A-NEXT: v_accvgpr_read_b32 v30, a32
-; GFX90A-NEXT: v_accvgpr_read_b32 v31, a33
-; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; use v[0:31]
-; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: buffer_load_dword a34, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword a33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword a32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; GFX90A-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX90A-NEXT: v_accvgpr_read_b32 v19, a31 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v20, a30 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v21, a29 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v22, a28 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v23, a27 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v24, a26 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v25, a25 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v26, a24 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v27, a23 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v28, a22 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v29, a21 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v30, a20 ; Reload Reuse
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_accvgpr_read_b32 v31, a19 ; Reload Reuse
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use v[0:31]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_accvgpr_read_b32 v63, a18 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v62, a17 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v61, a16 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v60, a15 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v59, a14 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v58, a13 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v57, a12 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v56, a11 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v47, a10 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v46, a9 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v45, a8 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v44, a7 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v43, a6 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v42, a5 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v41, a4 ; Reload Reuse
+; GFX90A-NEXT: v_accvgpr_read_b32 v40, a3 ; Reload Reuse
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: global_atomic_xor_i32_ret_av_av_no_agprs:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: scratch_store_dword off, v40, s32 offset:72 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v41, s32 offset:68 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v42, s32 offset:64 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v43, s32 offset:60 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v44, s32 offset:56 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v45, s32 offset:52 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v46, s32 offset:48 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v47, s32 offset:44 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v56, s32 offset:40 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v57, s32 offset:36 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v58, s32 offset:32 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v59, s32 offset:28 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v60, s32 offset:24 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v61, s32 offset:20 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v62, s32 offset:16 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, v63, s32 offset:12 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, a32, s32 offset:8 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, a33, s32 offset:4 ; 4-byte Folded Spill
-; GFX950-NEXT: scratch_store_dword off, a34, s32 ; 4-byte Folded Spill
+; GFX950-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a6, v43 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a7, v44 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a8, v45 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a9, v46 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a10, v47 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a11, v56 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a12, v57 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a13, v58 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a14, v59 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a15, v60 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[0:31]
; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: scratch_store_dwordx4 off, v[0:3], s32 ; 16-byte Folded Spill
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: scratch_store_dwordx4 off, v[4:7], s32 offset:16 ; 16-byte Folded Spill
+; GFX950-NEXT: scratch_store_dwordx4 off, v[8:11], s32 offset:32 ; 16-byte Folded Spill
+; GFX950-NEXT: scratch_store_dwordx4 off, v[12:15], s32 offset:48 ; 16-byte Folded Spill
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def a34
+; GFX950-NEXT: ; def a2
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_accvgpr_write_b32 a33, v31
-; GFX950-NEXT: v_accvgpr_write_b32 a32, v30
-; GFX950-NEXT: v_accvgpr_write_b32 a31, v29
-; GFX950-NEXT: v_accvgpr_write_b32 a30, v28
-; GFX950-NEXT: v_accvgpr_write_b32 a29, v27
-; GFX950-NEXT: v_accvgpr_write_b32 a28, v26
-; GFX950-NEXT: v_accvgpr_write_b32 a27, v25
-; GFX950-NEXT: v_accvgpr_write_b32 a26, v24
-; GFX950-NEXT: v_accvgpr_write_b32 a25, v23
-; GFX950-NEXT: v_accvgpr_write_b32 a24, v22
-; GFX950-NEXT: v_accvgpr_write_b32 a23, v21
-; GFX950-NEXT: v_accvgpr_write_b32 a22, v20
-; GFX950-NEXT: v_accvgpr_write_b32 a21, v19
-; GFX950-NEXT: v_accvgpr_write_b32 a20, v18
-; GFX950-NEXT: v_accvgpr_write_b32 a19, v17
-; GFX950-NEXT: v_accvgpr_write_b32 a18, v16
-; GFX950-NEXT: v_accvgpr_write_b32 a17, v15
-; GFX950-NEXT: v_accvgpr_write_b32 a16, v14
-; GFX950-NEXT: v_accvgpr_write_b32 a15, v13
-; GFX950-NEXT: v_accvgpr_write_b32 a14, v12
-; GFX950-NEXT: v_accvgpr_write_b32 a13, v11
-; GFX950-NEXT: v_accvgpr_write_b32 a12, v10
-; GFX950-NEXT: v_accvgpr_write_b32 a11, v9
-; GFX950-NEXT: v_accvgpr_write_b32 a10, v8
-; GFX950-NEXT: v_accvgpr_write_b32 a9, v7
-; GFX950-NEXT: v_accvgpr_write_b32 a8, v6
-; GFX950-NEXT: v_accvgpr_write_b32 a7, v5
-; GFX950-NEXT: v_accvgpr_write_b32 a6, v4
-; GFX950-NEXT: v_accvgpr_write_b32 a5, v3
-; GFX950-NEXT: v_accvgpr_write_b32 a4, v2
-; GFX950-NEXT: v_accvgpr_write_b32 a3, v1
-; GFX950-NEXT: v_accvgpr_write_b32 a2, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a19, v31 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX950-NEXT: scratch_store_dwordx3 off, v[16:18], s32 offset:64 ; 12-byte Folded Spill
; GFX950-NEXT: v_accvgpr_read_b32 v1, a1
-; GFX950-NEXT: v_accvgpr_read_b32 v2, a34
+; GFX950-NEXT: v_accvgpr_read_b32 v2, a2
; GFX950-NEXT: buffer_wbl2 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: global_atomic_xor v0, v[0:1], v2, off sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: buffer_inv sc1
+; GFX950-NEXT: v_accvgpr_write_b32 a31, v19 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a27, v23 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a28, v22 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a29, v21 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a30, v20 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a23, v27 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a24, v26 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a25, v25 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a26, v24 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_read_b32 v0, a2
-; GFX950-NEXT: v_accvgpr_read_b32 v1, a3
-; GFX950-NEXT: v_accvgpr_read_b32 v2, a4
-; GFX950-NEXT: v_accvgpr_read_b32 v3, a5
-; GFX950-NEXT: v_accvgpr_read_b32 v4, a6
-; GFX950-NEXT: v_accvgpr_read_b32 v5, a7
-; GFX950-NEXT: v_accvgpr_read_b32 v6, a8
-; GFX950-NEXT: v_accvgpr_read_b32 v7, a9
-; GFX950-NEXT: v_accvgpr_read_b32 v8, a10
-; GFX950-NEXT: v_accvgpr_read_b32 v9, a11
-; GFX950-NEXT: v_accvgpr_read_b32 v10, a12
-; GFX950-NEXT: v_accvgpr_read_b32 v11, a13
-; GFX950-NEXT: v_accvgpr_read_b32 v12, a14
-; GFX950-NEXT: v_accvgpr_read_b32 v13, a15
-; GFX950-NEXT: v_accvgpr_read_b32 v14, a16
-; GFX950-NEXT: v_accvgpr_read_b32 v15, a17
-; GFX950-NEXT: v_accvgpr_read_b32 v16, a18
-; GFX950-NEXT: v_accvgpr_read_b32 v17, a19
-; GFX950-NEXT: v_accvgpr_read_b32 v18, a20
-; GFX950-NEXT: v_accvgpr_read_b32 v19, a21
-; GFX950-NEXT: v_accvgpr_read_b32 v20, a22
-; GFX950-NEXT: v_accvgpr_read_b32 v21, a23
-; GFX950-NEXT: v_accvgpr_read_b32 v22, a24
-; GFX950-NEXT: v_accvgpr_read_b32 v23, a25
-; GFX950-NEXT: v_accvgpr_read_b32 v24, a26
-; GFX950-NEXT: v_accvgpr_read_b32 v25, a27
-; GFX950-NEXT: v_accvgpr_read_b32 v26, a28
-; GFX950-NEXT: v_accvgpr_read_b32 v27, a29
-; GFX950-NEXT: v_accvgpr_read_b32 v28, a30
-; GFX950-NEXT: v_accvgpr_read_b32 v29, a31
-; GFX950-NEXT: v_accvgpr_read_b32 v30, a32
-; GFX950-NEXT: v_accvgpr_read_b32 v31, a33
-; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v[0:31]
-; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: scratch_load_dword a34, off, s32 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword a33, off, s32 offset:4 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword a32, off, s32 offset:8 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v63, off, s32 offset:12 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v62, off, s32 offset:16 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v61, off, s32 offset:20 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v60, off, s32 offset:24 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v59, off, s32 offset:28 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v58, off, s32 offset:32 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v57, off, s32 offset:36 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v56, off, s32 offset:40 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v47, off, s32 offset:44 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v46, off, s32 offset:48 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v45, off, s32 offset:52 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v44, off, s32 offset:56 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v43, off, s32 offset:60 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v42, off, s32 offset:64 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v41, off, s32 offset:68 ; 4-byte Folded Reload
-; GFX950-NEXT: scratch_load_dword v40, off, s32 offset:72 ; 4-byte Folded Reload
+; GFX950-NEXT: scratch_load_dwordx4 v[0:3], off, s32 ; 16-byte Folded Reload
+; GFX950-NEXT: scratch_load_dwordx4 v[4:7], off, s32 offset:16 ; 16-byte Folded Reload
+; GFX950-NEXT: scratch_load_dwordx4 v[8:11], off, s32 offset:32 ; 16-byte Folded Reload
+; GFX950-NEXT: scratch_load_dwordx4 v[12:15], off, s32 offset:48 ; 16-byte Folded Reload
+; GFX950-NEXT: v_accvgpr_read_b32 v19, a31 ; Reload Reuse
+; GFX950-NEXT: scratch_load_dwordx3 v[16:18], off, s32 offset:64 ; 12-byte Folded Reload
+; GFX950-NEXT: v_accvgpr_read_b32 v23, a27 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v22, a28 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v21, a29 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v20, a30 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v27, a23 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v26, a24 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v25, a25 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v24, a26 ; Reload Reuse
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_accvgpr_read_b32 v31, a19 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v30, a20 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v29, a21 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v28, a22 ; Reload Reuse
+; GFX950-NEXT: ;;#ASMSTART
+; GFX950-NEXT: ; use v[0:31]
+; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: v_accvgpr_read_b32 v63, a18 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v62, a17 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v61, a16 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v60, a15 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v59, a14 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v58, a13 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v57, a12 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v56, a11 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v47, a10 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v46, a9 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v45, a8 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v44, a7 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v43, a6 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v42, a5 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v41, a4 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v40, a3 ; Reload Reuse
; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10
%data = call i32 asm "; def $0", "=^VA"()
@@ -3971,13 +3893,13 @@ define void @global_atomic_nand_i32_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB69_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -4000,12 +3922,12 @@ define void @global_atomic_nand_i32_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB69_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -4593,13 +4515,13 @@ define void @global_atomic_usub_cond_i32_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB85_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -4624,13 +4546,13 @@ define void @global_atomic_usub_cond_i32_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB85_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -4724,12 +4646,12 @@ define void @global_atomic_usub_sat_i32_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB87_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -4752,12 +4674,12 @@ define void @global_atomic_usub_sat_i32_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB87_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -5078,14 +5000,14 @@ define void @global_atomic_nand_i64_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB95_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -5111,14 +5033,14 @@ define void @global_atomic_nand_i64_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB95_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -5742,14 +5664,14 @@ define void @global_atomic_usub_cond_i64_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB111_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -5778,14 +5700,14 @@ define void @global_atomic_usub_cond_i64_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB111_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -5888,14 +5810,14 @@ define void @global_atomic_usub_sat_i64_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB113_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -5924,14 +5846,14 @@ define void @global_atomic_usub_sat_i64_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB113_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -6105,12 +6027,12 @@ define void @global_atomic_fsub_f32_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB117_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -6133,12 +6055,12 @@ define void @global_atomic_fsub_f32_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB117_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -6228,13 +6150,13 @@ define void @global_atomic_fmax_f32_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB119_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -6258,13 +6180,13 @@ define void @global_atomic_fmax_f32_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB119_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -6358,13 +6280,13 @@ define void @global_atomic_fmin_f32_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB121_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -6388,13 +6310,13 @@ define void @global_atomic_fmin_f32_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB121_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -6489,13 +6411,13 @@ define void @global_atomic_fmaximum_f32_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB123_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -6518,12 +6440,12 @@ define void @global_atomic_fmaximum_f32_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB123_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -6617,13 +6539,13 @@ define void @global_atomic_fminimum_f32_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB125_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -6646,12 +6568,12 @@ define void @global_atomic_fminimum_f32_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB125_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -6822,14 +6744,14 @@ define void @global_atomic_fsub_f64_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB129_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -6852,14 +6774,14 @@ define void @global_atomic_fsub_f64_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB129_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -7102,14 +7024,14 @@ define void @global_atomic_fmaximum_f64_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB135_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -7137,14 +7059,14 @@ define void @global_atomic_fmaximum_f64_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB135_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -7246,14 +7168,14 @@ define void @global_atomic_fminimum_f64_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB137_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -7281,14 +7203,14 @@ define void @global_atomic_fminimum_f64_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB137_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -7461,12 +7383,12 @@ define void @global_atomic_fsub_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB141_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -7489,12 +7411,12 @@ define void @global_atomic_fsub_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB141_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -7584,13 +7506,13 @@ define void @global_atomic_fmax_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB143_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -7615,13 +7537,13 @@ define void @global_atomic_fmax_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB143_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -7716,13 +7638,13 @@ define void @global_atomic_fmin_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB145_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -7747,13 +7669,13 @@ define void @global_atomic_fmin_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB145_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -7853,13 +7775,13 @@ define void @global_atomic_fmaximum_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB147_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -7882,12 +7804,12 @@ define void @global_atomic_fmaximum_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB147_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -7989,13 +7911,13 @@ define void @global_atomic_fminimum_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB149_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -8018,12 +7940,12 @@ define void @global_atomic_fminimum_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB149_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -8140,13 +8062,13 @@ define void @global_atomic_fadd_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB151_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -8273,13 +8195,13 @@ define void @global_atomic_fsub_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB153_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -8307,13 +8229,13 @@ define void @global_atomic_fsub_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB153_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -8443,13 +8365,13 @@ define void @global_atomic_fmax_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB155_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -8477,13 +8399,13 @@ define void @global_atomic_fmax_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB155_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -8613,13 +8535,13 @@ define void @global_atomic_fmin_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB157_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -8647,13 +8569,13 @@ define void @global_atomic_fmin_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB157_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -8788,13 +8710,13 @@ define void @global_atomic_fmaximum_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB159_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -8822,13 +8744,13 @@ define void @global_atomic_fmaximum_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB159_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -8968,13 +8890,13 @@ define void @global_atomic_fminimum_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB161_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -9002,13 +8924,13 @@ define void @global_atomic_fminimum_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB161_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -9431,13 +9353,13 @@ define void @global_atomic_nand_i32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #
; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB171_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -9461,12 +9383,12 @@ define void @global_atomic_nand_i32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB171_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -10160,13 +10082,13 @@ define void @global_atomic_usub_cond_i32_saddr_ret_a_a(ptr addrspace(1) inreg %p
; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB189_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -10192,13 +10114,13 @@ define void @global_atomic_usub_cond_i32_saddr_ret_a_a(ptr addrspace(1) inreg %p
; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB189_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -10295,12 +10217,12 @@ define void @global_atomic_usub_sat_i32_saddr_ret_a_a(ptr addrspace(1) inreg %pt
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB191_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -10324,12 +10246,12 @@ define void @global_atomic_usub_sat_i32_saddr_ret_a_a(ptr addrspace(1) inreg %pt
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB191_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -10740,14 +10662,14 @@ define void @global_atomic_nand_i64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #
; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[16:17] offset:80 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB201_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -10774,14 +10696,14 @@ define void @global_atomic_nand_i64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #
; GFX950-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:80 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB201_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -11507,14 +11429,14 @@ define void @global_atomic_usub_cond_i64_saddr_ret_a_a(ptr addrspace(1) inreg %p
; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[16:17] offset:80 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB219_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -11544,14 +11466,14 @@ define void @global_atomic_usub_cond_i64_saddr_ret_a_a(ptr addrspace(1) inreg %p
; GFX950-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:80 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB219_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -11657,14 +11579,14 @@ define void @global_atomic_usub_sat_i64_saddr_ret_a_a(ptr addrspace(1) inreg %pt
; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[16:17] offset:80 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB221_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -11694,14 +11616,14 @@ define void @global_atomic_usub_sat_i64_saddr_ret_a_a(ptr addrspace(1) inreg %pt
; GFX950-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:80 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB221_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -11882,12 +11804,12 @@ define void @global_atomic_fsub_f32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB225_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -11911,12 +11833,12 @@ define void @global_atomic_fsub_f32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB225_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -12009,13 +11931,13 @@ define void @global_atomic_fmax_f32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #
; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB227_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -12040,13 +11962,13 @@ define void @global_atomic_fmax_f32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #
; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB227_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -12143,13 +12065,13 @@ define void @global_atomic_fmin_f32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #
; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB229_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -12174,13 +12096,13 @@ define void @global_atomic_fmin_f32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #
; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB229_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -12278,13 +12200,13 @@ define void @global_atomic_fmaximum_f32_saddr_ret_a_a(ptr addrspace(1) inreg %pt
; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB231_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -12308,12 +12230,12 @@ define void @global_atomic_fmaximum_f32_saddr_ret_a_a(ptr addrspace(1) inreg %pt
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB231_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -12410,13 +12332,13 @@ define void @global_atomic_fminimum_f32_saddr_ret_a_a(ptr addrspace(1) inreg %pt
; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB233_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -12440,12 +12362,12 @@ define void @global_atomic_fminimum_f32_saddr_ret_a_a(ptr addrspace(1) inreg %pt
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB233_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -12622,14 +12544,14 @@ define void @global_atomic_fsub_f64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #
; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[16:17] offset:80 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB237_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -12653,14 +12575,14 @@ define void @global_atomic_fsub_f64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #
; GFX950-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:80 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB237_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -12912,14 +12834,14 @@ define void @global_atomic_fmaximum_f64_saddr_ret_a_a(ptr addrspace(1) inreg %pt
; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[16:17] offset:80 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB243_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -12948,14 +12870,14 @@ define void @global_atomic_fmaximum_f64_saddr_ret_a_a(ptr addrspace(1) inreg %pt
; GFX950-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:80 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB243_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -13060,14 +12982,14 @@ define void @global_atomic_fminimum_f64_saddr_ret_a_a(ptr addrspace(1) inreg %pt
; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[16:17] offset:80 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB245_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a[0:1]
; GFX90A-NEXT: ;;#ASMEND
@@ -13096,14 +13018,14 @@ define void @global_atomic_fminimum_f64_saddr_ret_a_a(ptr addrspace(1) inreg %pt
; GFX950-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:80 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB245_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -13283,12 +13205,12 @@ define void @global_atomic_fsub_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr)
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB249_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -13312,12 +13234,12 @@ define void @global_atomic_fsub_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr)
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB249_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -13410,13 +13332,13 @@ define void @global_atomic_fmax_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr)
; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB251_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -13442,13 +13364,13 @@ define void @global_atomic_fmax_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr)
; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB251_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -13546,13 +13468,13 @@ define void @global_atomic_fmin_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr)
; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB253_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -13578,13 +13500,13 @@ define void @global_atomic_fmin_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr)
; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB253_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -13687,13 +13609,13 @@ define void @global_atomic_fmaximum_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg %
; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB255_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -13717,12 +13639,12 @@ define void @global_atomic_fmaximum_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg %
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB255_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -13827,13 +13749,13 @@ define void @global_atomic_fminimum_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg %
; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB257_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -13857,12 +13779,12 @@ define void @global_atomic_fminimum_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg %
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB257_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -13982,13 +13904,13 @@ define void @global_atomic_fadd_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr
; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB259_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -14119,13 +14041,13 @@ define void @global_atomic_fsub_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr
; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB261_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -14154,13 +14076,13 @@ define void @global_atomic_fsub_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr
; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB261_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -14293,13 +14215,13 @@ define void @global_atomic_fmax_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr
; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB263_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -14328,13 +14250,13 @@ define void @global_atomic_fmax_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr
; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB263_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -14467,13 +14389,13 @@ define void @global_atomic_fmin_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr
; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB265_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -14502,13 +14424,13 @@ define void @global_atomic_fmin_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr
; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB265_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -14646,13 +14568,13 @@ define void @global_atomic_fmaximum_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg
; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB267_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -14681,13 +14603,13 @@ define void @global_atomic_fmaximum_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg
; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB267_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -14830,13 +14752,13 @@ define void @global_atomic_fminimum_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg
; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB269_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -14865,13 +14787,13 @@ define void @global_atomic_fminimum_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg
; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX950-NEXT: v_mov_b32_e32 v1, v0
; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX950-NEXT: s_cbranch_execnz .LBB269_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
index ebbeab94066d6..9e240238c1066 100644
--- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
@@ -146,9 +146,9 @@ define void @no_free_vgprs_at_agpr_to_agpr_copy(float %v0, float %v1) #0 {
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; copy
; GFX908-NEXT: ;;#ASMEND
-; GFX908-NEXT: v_accvgpr_read_b32 v39, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v32, a2
; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: v_accvgpr_write_b32 a3, v39
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v32
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; use a3 v[0:31]
; GFX908-NEXT: ;;#ASMEND
@@ -437,9 +437,9 @@ define void @v32_asm_def_use(float %v0, float %v1) #4 {
; GFX908-NEXT: ; copy
; GFX908-NEXT: ;;#ASMEND
; GFX908-NEXT: s_nop 7
-; GFX908-NEXT: v_accvgpr_read_b32 v35, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v33, a2
; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: v_accvgpr_write_b32 a3, v35
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v33
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; use a3 v[0:31]
; GFX908-NEXT: ;;#ASMEND
@@ -1045,9 +1045,9 @@ define void @no_free_vgprs_at_sgpr_to_agpr_copy(float %v0, float %v1) #0 {
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; copy
; GFX908-NEXT: ;;#ASMEND
-; GFX908-NEXT: v_accvgpr_read_b32 v39, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v32, a2
; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: v_accvgpr_write_b32 a3, v39
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v32
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; use a3 v[0:31]
; GFX908-NEXT: ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-csr.ll b/llvm/test/CodeGen/AMDGPU/agpr-csr.ll
index 0c5fd1fc0932a..63b7b70548baf 100644
--- a/llvm/test/CodeGen/AMDGPU/agpr-csr.ll
+++ b/llvm/test/CodeGen/AMDGPU/agpr-csr.ll
@@ -180,63 +180,55 @@ define amdgpu_kernel void @test_call_empty() #0 {
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; def a[0:31]
; GFX908-NEXT: ;;#ASMEND
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a3
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v10, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v9, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v8, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v7, a4
+; GFX908-NEXT: v_accvgpr_read_b32 v14, a11
+; GFX908-NEXT: v_accvgpr_read_b32 v13, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v12, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v11, a8
+; GFX908-NEXT: v_accvgpr_read_b32 v18, a15
+; GFX908-NEXT: v_accvgpr_read_b32 v17, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v16, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v15, a12
+; GFX908-NEXT: v_accvgpr_read_b32 v22, a19
+; GFX908-NEXT: v_accvgpr_read_b32 v21, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v20, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v19, a16
+; GFX908-NEXT: v_accvgpr_read_b32 v26, a23
+; GFX908-NEXT: v_accvgpr_read_b32 v25, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v24, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v23, a20
+; GFX908-NEXT: v_accvgpr_read_b32 v30, a27
+; GFX908-NEXT: v_accvgpr_read_b32 v29, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v28, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v27, a24
+; GFX908-NEXT: v_accvgpr_read_b32 v35, a31
+; GFX908-NEXT: v_accvgpr_read_b32 v34, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v33, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v32, a28
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a28
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a29
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a30
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a31
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a20
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a21
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a22
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a23
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a16
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a17
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a18
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a19
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a12
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a13
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a15
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a8
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a9
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a10
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a11
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[32:35], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[27:30], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[23:26], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[19:22], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[15:18], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[11:14], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[7:10], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[3:6], off
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: s_endpgm
bb:
@@ -321,65 +313,57 @@ define amdgpu_kernel void @test_call_areg4() #0 {
; GFX908-NEXT: s_mov_b64 s[2:3], s[22:23]
; GFX908-NEXT: s_mov_b32 s32, 0
; GFX908-NEXT: ;;#ASMSTART
-; GFX908-NEXT: ; def a[4:35]
+; GFX908-NEXT: ; def a[0:31]
; GFX908-NEXT: ;;#ASMEND
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a3
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v10, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v9, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v8, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v7, a4
+; GFX908-NEXT: v_accvgpr_read_b32 v14, a11
+; GFX908-NEXT: v_accvgpr_read_b32 v13, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v12, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v11, a8
+; GFX908-NEXT: v_accvgpr_read_b32 v18, a15
+; GFX908-NEXT: v_accvgpr_read_b32 v17, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v16, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v15, a12
+; GFX908-NEXT: v_accvgpr_read_b32 v22, a19
+; GFX908-NEXT: v_accvgpr_read_b32 v21, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v20, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v19, a16
+; GFX908-NEXT: v_accvgpr_read_b32 v26, a23
+; GFX908-NEXT: v_accvgpr_read_b32 v25, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v24, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v23, a20
+; GFX908-NEXT: v_accvgpr_read_b32 v30, a27
+; GFX908-NEXT: v_accvgpr_read_b32 v29, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v28, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v27, a24
+; GFX908-NEXT: v_accvgpr_read_b32 v35, a31
+; GFX908-NEXT: v_accvgpr_read_b32 v34, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v33, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v32, a28
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a32
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a33
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a34
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a35
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a28
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a29
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a30
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a31
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a20
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a21
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a22
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a23
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a16
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a17
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a18
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a19
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a12
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a13
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a15
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a8
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a9
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a10
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a11
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[32:35], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[27:30], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[23:26], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[19:22], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[15:18], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[11:14], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[7:10], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[3:6], off
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: s_endpgm
bb:
@@ -464,65 +448,57 @@ define amdgpu_kernel void @test_call_areg32() #0 {
; GFX908-NEXT: s_mov_b64 s[2:3], s[22:23]
; GFX908-NEXT: s_mov_b32 s32, 0
; GFX908-NEXT: ;;#ASMSTART
-; GFX908-NEXT: ; def a[32:63]
+; GFX908-NEXT: ; def a[0:31]
; GFX908-NEXT: ;;#ASMEND
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a3
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v10, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v9, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v8, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v7, a4
+; GFX908-NEXT: v_accvgpr_read_b32 v14, a11
+; GFX908-NEXT: v_accvgpr_read_b32 v13, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v12, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v11, a8
+; GFX908-NEXT: v_accvgpr_read_b32 v18, a15
+; GFX908-NEXT: v_accvgpr_read_b32 v17, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v16, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v15, a12
+; GFX908-NEXT: v_accvgpr_read_b32 v22, a19
+; GFX908-NEXT: v_accvgpr_read_b32 v21, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v20, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v19, a16
+; GFX908-NEXT: v_accvgpr_read_b32 v26, a23
+; GFX908-NEXT: v_accvgpr_read_b32 v25, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v24, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v23, a20
+; GFX908-NEXT: v_accvgpr_read_b32 v30, a27
+; GFX908-NEXT: v_accvgpr_read_b32 v29, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v28, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v27, a24
+; GFX908-NEXT: v_accvgpr_read_b32 v35, a31
+; GFX908-NEXT: v_accvgpr_read_b32 v34, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v33, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v32, a28
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a60
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a61
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a62
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a63
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a56
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a57
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a58
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a59
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a52
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a53
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a54
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a55
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a48
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a49
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a50
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a51
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a44
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a45
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a46
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a47
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a40
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a41
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a42
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a43
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a36
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a37
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a38
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a39
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a32
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a33
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a34
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a35
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[32:35], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[27:30], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[23:26], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[19:22], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[15:18], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[11:14], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[7:10], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[3:6], off
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: s_endpgm
bb:
@@ -609,63 +585,55 @@ define amdgpu_kernel void @test_call_areg64() #0 {
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; def a[0:31]
; GFX908-NEXT: ;;#ASMEND
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a3
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v10, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v9, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v8, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v7, a4
+; GFX908-NEXT: v_accvgpr_read_b32 v14, a11
+; GFX908-NEXT: v_accvgpr_read_b32 v13, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v12, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v11, a8
+; GFX908-NEXT: v_accvgpr_read_b32 v18, a15
+; GFX908-NEXT: v_accvgpr_read_b32 v17, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v16, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v15, a12
+; GFX908-NEXT: v_accvgpr_read_b32 v22, a19
+; GFX908-NEXT: v_accvgpr_read_b32 v21, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v20, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v19, a16
+; GFX908-NEXT: v_accvgpr_read_b32 v26, a23
+; GFX908-NEXT: v_accvgpr_read_b32 v25, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v24, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v23, a20
+; GFX908-NEXT: v_accvgpr_read_b32 v30, a27
+; GFX908-NEXT: v_accvgpr_read_b32 v29, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v28, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v27, a24
+; GFX908-NEXT: v_accvgpr_read_b32 v35, a31
+; GFX908-NEXT: v_accvgpr_read_b32 v34, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v33, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v32, a28
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a28
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a29
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a30
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a31
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a20
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a21
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a22
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a23
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a16
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a17
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a18
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a19
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a12
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a13
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a15
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a8
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a9
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a10
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a11
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[32:35], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[27:30], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[23:26], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[19:22], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[15:18], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[11:14], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[7:10], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[3:6], off
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: s_endpgm
bb:
@@ -750,65 +718,57 @@ define amdgpu_kernel void @test_call_areg31_63() #0 {
; GFX908-NEXT: s_mov_b64 s[2:3], s[22:23]
; GFX908-NEXT: s_mov_b32 s32, 0
; GFX908-NEXT: ;;#ASMSTART
-; GFX908-NEXT: ; def a[64:95]
+; GFX908-NEXT: ; def a[0:31]
; GFX908-NEXT: ;;#ASMEND
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a3
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v10, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v9, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v8, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v7, a4
+; GFX908-NEXT: v_accvgpr_read_b32 v14, a11
+; GFX908-NEXT: v_accvgpr_read_b32 v13, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v12, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v11, a8
+; GFX908-NEXT: v_accvgpr_read_b32 v18, a15
+; GFX908-NEXT: v_accvgpr_read_b32 v17, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v16, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v15, a12
+; GFX908-NEXT: v_accvgpr_read_b32 v22, a19
+; GFX908-NEXT: v_accvgpr_read_b32 v21, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v20, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v19, a16
+; GFX908-NEXT: v_accvgpr_read_b32 v26, a23
+; GFX908-NEXT: v_accvgpr_read_b32 v25, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v24, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v23, a20
+; GFX908-NEXT: v_accvgpr_read_b32 v30, a27
+; GFX908-NEXT: v_accvgpr_read_b32 v29, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v28, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v27, a24
+; GFX908-NEXT: v_accvgpr_read_b32 v35, a31
+; GFX908-NEXT: v_accvgpr_read_b32 v34, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v33, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v32, a28
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a92
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a93
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a94
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a95
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a88
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a89
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a90
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a91
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a84
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a85
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a86
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a87
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a80
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a81
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a82
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a83
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a76
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a77
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a78
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a79
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a72
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a73
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a74
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a75
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a68
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a69
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a70
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a71
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a64
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a65
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a66
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a67
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[32:35], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[27:30], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[23:26], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[19:22], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[15:18], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[11:14], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[7:10], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[3:6], off
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: s_endpgm
bb:
@@ -889,125 +849,61 @@ define amdgpu_kernel void @test_call_unknown() #0 {
; GFX908-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX908-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX908-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX908-NEXT: ;;#ASMSTART
-; GFX908-NEXT: ; def a[0:31]
-; GFX908-NEXT: ;;#ASMEND
; GFX908-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX908-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX908-NEXT: s_mov_b32 s32, 0
-; GFX908-NEXT: v_accvgpr_read_b32 v95, a0 ; Reload Reuse
-; GFX908-NEXT: v_accvgpr_read_b32 v94, a1 ; Reload Reuse
-; GFX908-NEXT: v_accvgpr_read_b32 v93, a2 ; Reload Reuse
-; GFX908-NEXT: v_accvgpr_read_b32 v92, a3 ; Reload Reuse
-; GFX908-NEXT: v_accvgpr_read_b32 v91, a4 ; Reload Reuse
-; GFX908-NEXT: v_accvgpr_read_b32 v90, a5 ; Reload Reuse
-; GFX908-NEXT: v_accvgpr_read_b32 v89, a6 ; Reload Reuse
-; GFX908-NEXT: v_accvgpr_read_b32 v88, a7 ; Reload Reuse
-; GFX908-NEXT: v_accvgpr_read_b32 v79, a8 ; Reload Reuse
-; GFX908-NEXT: v_accvgpr_read_b32 v78, a9 ; Reload Reuse
-; GFX908-NEXT: v_accvgpr_read_b32 v77, a10 ; Reload Reuse
-; GFX908-NEXT: v_accvgpr_read_b32 v76, a11 ; Reload Reuse
-; GFX908-NEXT: v_accvgpr_read_b32 v75, a12 ; Reload Reuse
-; GFX908-NEXT: v_accvgpr_read_b32 v74, a13 ; Reload Reuse
-; GFX908-NEXT: v_accvgpr_read_b32 v73, a14 ; Reload Reuse
-; GFX908-NEXT: v_accvgpr_read_b32 v72, a15 ; Reload Reuse
-; GFX908-NEXT: v_accvgpr_read_b32 v63, a16 ; Reload Reuse
-; GFX908-NEXT: v_accvgpr_read_b32 v62, a17 ; Reload Reuse
-; GFX908-NEXT: v_accvgpr_read_b32 v61, a18 ; Reload Reuse
-; GFX908-NEXT: v_accvgpr_read_b32 v60, a19 ; Reload Reuse
-; GFX908-NEXT: v_accvgpr_read_b32 v59, a20 ; Reload Reuse
-; GFX908-NEXT: v_accvgpr_read_b32 v58, a21 ; Reload Reuse
-; GFX908-NEXT: v_accvgpr_read_b32 v57, a22 ; Reload Reuse
-; GFX908-NEXT: v_accvgpr_read_b32 v56, a23 ; Reload Reuse
-; GFX908-NEXT: v_accvgpr_read_b32 v47, a24 ; Reload Reuse
-; GFX908-NEXT: v_accvgpr_read_b32 v46, a25 ; Reload Reuse
-; GFX908-NEXT: v_accvgpr_read_b32 v45, a26 ; Reload Reuse
-; GFX908-NEXT: v_accvgpr_read_b32 v44, a27 ; Reload Reuse
-; GFX908-NEXT: v_accvgpr_read_b32 v43, a28 ; Reload Reuse
-; GFX908-NEXT: v_accvgpr_read_b32 v42, a29 ; Reload Reuse
-; GFX908-NEXT: v_accvgpr_read_b32 v41, a30 ; Reload Reuse
-; GFX908-NEXT: v_accvgpr_read_b32 v40, a31 ; Reload Reuse
+; GFX908-NEXT: ;;#ASMSTART
+; GFX908-NEXT: ; def a[0:31]
+; GFX908-NEXT: ;;#ASMEND
+; GFX908-NEXT: v_accvgpr_read_b32 v43, a3
+; GFX908-NEXT: v_accvgpr_read_b32 v42, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v41, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v40, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v47, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v46, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v45, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v44, a4
+; GFX908-NEXT: v_accvgpr_read_b32 v59, a11
+; GFX908-NEXT: v_accvgpr_read_b32 v58, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v57, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v56, a8
+; GFX908-NEXT: v_accvgpr_read_b32 v63, a15
+; GFX908-NEXT: v_accvgpr_read_b32 v62, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v61, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v60, a12
+; GFX908-NEXT: v_accvgpr_read_b32 v75, a19
+; GFX908-NEXT: v_accvgpr_read_b32 v74, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v73, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v72, a16
+; GFX908-NEXT: v_accvgpr_read_b32 v79, a23
+; GFX908-NEXT: v_accvgpr_read_b32 v78, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v77, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v76, a20
+; GFX908-NEXT: v_accvgpr_read_b32 v91, a27
+; GFX908-NEXT: v_accvgpr_read_b32 v90, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v89, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v88, a24
+; GFX908-NEXT: v_accvgpr_read_b32 v95, a31
+; GFX908-NEXT: v_accvgpr_read_b32 v94, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v93, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v92, a28
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX908-NEXT: v_mov_b32_e32 v4, v95
-; GFX908-NEXT: v_mov_b32_e32 v5, v94
-; GFX908-NEXT: v_mov_b32_e32 v6, v93
-; GFX908-NEXT: v_mov_b32_e32 v7, v92
-; GFX908-NEXT: v_mov_b32_e32 v8, v91
-; GFX908-NEXT: v_mov_b32_e32 v9, v90
-; GFX908-NEXT: v_mov_b32_e32 v10, v89
-; GFX908-NEXT: v_mov_b32_e32 v11, v88
-; GFX908-NEXT: v_mov_b32_e32 v12, v79
-; GFX908-NEXT: v_mov_b32_e32 v13, v78
-; GFX908-NEXT: v_mov_b32_e32 v14, v77
-; GFX908-NEXT: v_mov_b32_e32 v15, v76
-; GFX908-NEXT: v_mov_b32_e32 v16, v75
-; GFX908-NEXT: v_mov_b32_e32 v17, v74
-; GFX908-NEXT: v_mov_b32_e32 v18, v73
-; GFX908-NEXT: v_mov_b32_e32 v19, v72
-; GFX908-NEXT: v_mov_b32_e32 v20, v63
-; GFX908-NEXT: v_mov_b32_e32 v21, v62
-; GFX908-NEXT: v_mov_b32_e32 v22, v61
-; GFX908-NEXT: v_mov_b32_e32 v23, v60
-; GFX908-NEXT: v_mov_b32_e32 v24, v59
-; GFX908-NEXT: v_mov_b32_e32 v25, v58
-; GFX908-NEXT: v_mov_b32_e32 v26, v57
-; GFX908-NEXT: v_mov_b32_e32 v27, v56
-; GFX908-NEXT: v_mov_b32_e32 v28, v47
-; GFX908-NEXT: v_mov_b32_e32 v29, v46
-; GFX908-NEXT: v_mov_b32_e32 v30, v45
-; GFX908-NEXT: v_mov_b32_e32 v31, v44
-; GFX908-NEXT: v_mov_b32_e32 v32, v43
-; GFX908-NEXT: v_mov_b32_e32 v33, v42
-; GFX908-NEXT: v_mov_b32_e32 v34, v41
-; GFX908-NEXT: v_mov_b32_e32 v35, v40
-; GFX908-NEXT: v_mov_b32_e32 v0, v32
-; GFX908-NEXT: v_mov_b32_e32 v1, v33
-; GFX908-NEXT: v_mov_b32_e32 v2, v34
-; GFX908-NEXT: v_mov_b32_e32 v3, v35
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v0, v28
-; GFX908-NEXT: v_mov_b32_e32 v1, v29
-; GFX908-NEXT: v_mov_b32_e32 v2, v30
-; GFX908-NEXT: v_mov_b32_e32 v3, v31
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v0, v24
-; GFX908-NEXT: v_mov_b32_e32 v1, v25
-; GFX908-NEXT: v_mov_b32_e32 v2, v26
-; GFX908-NEXT: v_mov_b32_e32 v3, v27
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v0, v20
-; GFX908-NEXT: v_mov_b32_e32 v1, v21
-; GFX908-NEXT: v_mov_b32_e32 v2, v22
-; GFX908-NEXT: v_mov_b32_e32 v3, v23
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v0, v16
-; GFX908-NEXT: v_mov_b32_e32 v1, v17
-; GFX908-NEXT: v_mov_b32_e32 v2, v18
-; GFX908-NEXT: v_mov_b32_e32 v3, v19
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v0, v12
-; GFX908-NEXT: v_mov_b32_e32 v1, v13
-; GFX908-NEXT: v_mov_b32_e32 v2, v14
-; GFX908-NEXT: v_mov_b32_e32 v3, v15
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v0, v8
-; GFX908-NEXT: v_mov_b32_e32 v1, v9
-; GFX908-NEXT: v_mov_b32_e32 v2, v10
-; GFX908-NEXT: v_mov_b32_e32 v3, v11
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v0, v4
-; GFX908-NEXT: v_mov_b32_e32 v1, v5
-; GFX908-NEXT: v_mov_b32_e32 v2, v6
-; GFX908-NEXT: v_mov_b32_e32 v3, v7
-; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[92:95], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[88:91], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[76:79], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[72:75], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[60:63], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[56:59], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[44:47], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[40:43], off
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: s_endpgm
bb:
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-remat.ll b/llvm/test/CodeGen/AMDGPU/agpr-remat.ll
index 1180fc7b35a0b..1a2dd6e5f90f6 100644
--- a/llvm/test/CodeGen/AMDGPU/agpr-remat.ll
+++ b/llvm/test/CodeGen/AMDGPU/agpr-remat.ll
@@ -29,17 +29,17 @@ define void @remat_regcopy_avoids_spill(i32 %v0, i32 %v1, i32 %v2, i32 %v3, i32
; GFX908-LABEL: remat_regcopy_avoids_spill:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_accvgpr_write_b32 a3, v3
-; GFX908-NEXT: v_accvgpr_write_b32 a1, v5
-; GFX908-NEXT: v_accvgpr_write_b32 a2, v4
+; GFX908-NEXT: v_accvgpr_write_b32 a2, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v1
; GFX908-NEXT: v_accvgpr_write_b32 a4, v2
-; GFX908-NEXT: v_accvgpr_write_b32 a5, v1
-; GFX908-NEXT: v_accvgpr_write_b32 a6, v0
-; GFX908-NEXT: v_accvgpr_write_b32 a7, v7
-; GFX908-NEXT: v_accvgpr_write_b32 a0, v8
+; GFX908-NEXT: v_accvgpr_write_b32 a0, v7
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v8
+; GFX908-NEXT: v_accvgpr_write_b32 a5, v3
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ;;#ASMEND
-; GFX908-NEXT: v_accvgpr_write_b32 a3, v6
+; GFX908-NEXT: v_accvgpr_write_b32 a2, v4
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v5
+; GFX908-NEXT: v_accvgpr_write_b32 a4, v6
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ;;#ASMEND
; GFX908-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
index 2cbf39e2464bc..c3b14e8829042 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
@@ -423,8 +423,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp
; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v6, v5
-; GFX942-NEXT: buffer_atomic_add_f32 v6, v4, s[4:7], 0 offen offset:1024 sc0
+; GFX942-NEXT: buffer_atomic_add_f32 v5, v4, s[4:7], 0 offen offset:1024 sc0
; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX942-NEXT: ; implicit-def: $vgpr4
; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
@@ -432,7 +431,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp
; GFX942-NEXT: ; %bb.2:
; GFX942-NEXT: s_mov_b64 exec, s[2:3]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v0, v5
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
@@ -542,8 +541,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v6, v5
-; GFX90A-NEXT: buffer_atomic_add_f32 v6, v4, s[8:11], 0 offen offset:1024 glc
+; GFX90A-NEXT: buffer_atomic_add_f32 v5, v4, s[8:11], 0 offen offset:1024 glc
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX90A-NEXT: ; implicit-def: $vgpr4
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
@@ -551,7 +549,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp
; GFX90A-NEXT: ; %bb.2:
; GFX90A-NEXT: s_mov_b64 exec, s[6:7]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v0, v5
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2443,8 +2441,8 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg
; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v9, v6
-; GFX942-NEXT: v_mov_b32_e32 v8, v5
+; GFX942-NEXT: v_mov_b32_e32 v7, v6
+; GFX942-NEXT: v_mov_b32_e32 v6, v5
; GFX942-NEXT: s_mov_b64 s[2:3], exec
; GFX942-NEXT: buffer_wbl2 sc1
; GFX942-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1
@@ -2458,7 +2456,6 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg
; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[8:9]
; GFX942-NEXT: buffer_atomic_add_f64 v[6:7], v4, s[4:7], 0 offen offset:2048 sc0
; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX942-NEXT: ; implicit-def: $vgpr4
@@ -2610,8 +2607,8 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v9, v6
-; GFX90A-NEXT: v_mov_b32_e32 v8, v5
+; GFX90A-NEXT: v_mov_b32_e32 v7, v6
+; GFX90A-NEXT: v_mov_b32_e32 v6, v5
; GFX90A-NEXT: s_mov_b64 s[6:7], exec
; GFX90A-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -2623,7 +2620,6 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1]
; GFX90A-NEXT: buffer_atomic_add_f64 v[6:7], v4, s[8:11], 0 offen offset:2048 glc
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX90A-NEXT: ; implicit-def: $vgpr4
@@ -4489,6 +4485,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX942-NEXT: v_lshlrev_b32_e32 v6, v4, v6
; GFX942-NEXT: v_and_or_b32 v6, v7, v11, v6
; GFX942-NEXT: s_mov_b64 s[8:9], exec
+; GFX942-NEXT: v_mov_b64_e32 v[8:9], v[6:7]
; GFX942-NEXT: buffer_wbl2 sc1
; GFX942-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
; GFX942-NEXT: ; => This Inner Loop Header: Depth=2
@@ -4502,7 +4499,6 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[8:9], v[6:7]
; GFX942-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[4:7], 0 offen sc0
; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB15_4
@@ -4778,6 +4774,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX90A-NEXT: v_lshlrev_b32_e32 v6, v4, v6
; GFX90A-NEXT: v_and_or_b32 v6, v7, v11, v6
; GFX90A-NEXT: s_mov_b64 s[12:13], exec
+; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -4789,7 +4786,6 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB15_4
@@ -6352,6 +6348,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX942-NEXT: v_and_or_b32 v6, v7, v10, v4
+; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
; GFX942-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
; GFX942-NEXT: ; => This Inner Loop Header: Depth=2
; GFX942-NEXT: v_readfirstlane_b32 s4, v0
@@ -6364,7 +6361,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0
; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB18_4
@@ -6678,6 +6674,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX90A-NEXT: v_and_or_b32 v6, v7, v10, v4
; GFX90A-NEXT: s_mov_b64 s[12:13], exec
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -6689,7 +6686,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB18_4
@@ -7532,8 +7528,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v6, v5
-; GFX942-NEXT: buffer_atomic_pk_add_f16 v6, v4, s[4:7], 0 offen offset:1024 sc0
+; GFX942-NEXT: buffer_atomic_pk_add_f16 v5, v4, s[4:7], 0 offen offset:1024 sc0
; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX942-NEXT: ; implicit-def: $vgpr4
; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
@@ -7541,7 +7536,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
; GFX942-NEXT: ; %bb.2:
; GFX942-NEXT: s_mov_b64 exec, s[2:3]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v0, v5
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
@@ -7687,8 +7682,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v6, v5
-; GFX90A-NEXT: buffer_atomic_pk_add_f16 v6, v4, s[8:11], 0 offen offset:1024 glc
+; GFX90A-NEXT: buffer_atomic_pk_add_f16 v5, v4, s[8:11], 0 offen offset:1024 glc
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX90A-NEXT: ; implicit-def: $vgpr4
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
@@ -7696,7 +7690,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
; GFX90A-NEXT: ; %bb.2:
; GFX90A-NEXT: s_mov_b64 exec, s[6:7]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v0, v5
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -9981,6 +9975,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc
; GFX942-NEXT: v_perm_b32 v6, v5, v4, s11
+; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
; GFX942-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1
; GFX942-NEXT: ; => This Inner Loop Header: Depth=2
; GFX942-NEXT: v_readfirstlane_b32 s4, v0
@@ -9993,7 +9988,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0
; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB28_4
@@ -10307,6 +10301,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc
; GFX90A-NEXT: v_perm_b32 v6, v5, v4, s15
; GFX90A-NEXT: s_mov_b64 s[12:13], exec
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -10318,7 +10313,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB28_4
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
index 187c8c9c11fa3..f7a1fb35c8106 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
@@ -429,6 +429,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp
; GFX942-NEXT: v_max_f32_e32 v4, v7, v7
; GFX942-NEXT: v_max_f32_e32 v6, v4, v9
; GFX942-NEXT: s_mov_b64 s[8:9], exec
+; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
; GFX942-NEXT: buffer_wbl2 sc1
; GFX942-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1
; GFX942-NEXT: ; => This Inner Loop Header: Depth=2
@@ -442,7 +443,6 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp
; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0
; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB2_4
@@ -549,6 +549,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp
; GFX90A-NEXT: v_max_f32_e32 v4, v7, v7
; GFX90A-NEXT: v_max_f32_e32 v6, v4, v9
; GFX90A-NEXT: s_mov_b64 s[12:13], exec
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -560,7 +561,6 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB2_4
@@ -1653,8 +1653,8 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg
; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v9, v6
-; GFX942-NEXT: v_mov_b32_e32 v8, v5
+; GFX942-NEXT: v_mov_b32_e32 v7, v6
+; GFX942-NEXT: v_mov_b32_e32 v6, v5
; GFX942-NEXT: s_mov_b64 s[2:3], exec
; GFX942-NEXT: buffer_wbl2 sc1
; GFX942-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
@@ -1668,7 +1668,6 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg
; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[8:9]
; GFX942-NEXT: buffer_atomic_max_f64 v[6:7], v4, s[4:7], 0 offen offset:2048 sc0
; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX942-NEXT: ; implicit-def: $vgpr4
@@ -1784,8 +1783,8 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v9, v6
-; GFX90A-NEXT: v_mov_b32_e32 v8, v5
+; GFX90A-NEXT: v_mov_b32_e32 v7, v6
+; GFX90A-NEXT: v_mov_b32_e32 v6, v5
; GFX90A-NEXT: s_mov_b64 s[6:7], exec
; GFX90A-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -1797,7 +1796,6 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1]
; GFX90A-NEXT: buffer_atomic_max_f64 v[6:7], v4, s[8:11], 0 offen offset:2048 glc
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX90A-NEXT: ; implicit-def: $vgpr4
@@ -3605,6 +3603,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX942-NEXT: v_lshlrev_b32_e32 v4, v8, v4
; GFX942-NEXT: v_and_or_b32 v6, v7, v10, v4
; GFX942-NEXT: s_mov_b64 s[8:9], exec
+; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
; GFX942-NEXT: buffer_wbl2 sc1
; GFX942-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1
; GFX942-NEXT: ; => This Inner Loop Header: Depth=2
@@ -3618,7 +3617,6 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0
; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB12_4
@@ -3904,6 +3902,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v8, v4
; GFX90A-NEXT: v_and_or_b32 v6, v7, v10, v4
; GFX90A-NEXT: s_mov_b64 s[12:13], exec
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -3915,7 +3914,6 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB12_4
@@ -5486,6 +5484,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX942-NEXT: v_and_or_b32 v6, v7, v10, v4
+; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
; GFX942-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
; GFX942-NEXT: ; => This Inner Loop Header: Depth=2
; GFX942-NEXT: v_readfirstlane_b32 s4, v0
@@ -5498,7 +5497,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0
; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB15_4
@@ -5812,6 +5810,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX90A-NEXT: v_and_or_b32 v6, v7, v10, v4
; GFX90A-NEXT: s_mov_b64 s[12:13], exec
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -5823,7 +5822,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB15_4
@@ -6878,6 +6876,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX942-NEXT: s_mov_b64 s[8:9], exec
; GFX942-NEXT: v_pk_max_f16 v6, v4, v9
; GFX942-NEXT: buffer_wbl2 sc1
+; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
; GFX942-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
; GFX942-NEXT: ; => This Inner Loop Header: Depth=2
; GFX942-NEXT: v_readfirstlane_b32 s4, v0
@@ -6890,7 +6889,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0
; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB18_4
@@ -7070,6 +7068,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX90A-NEXT: v_pk_max_f16 v4, v7, v7
; GFX90A-NEXT: v_pk_max_f16 v6, v4, v9
; GFX90A-NEXT: s_mov_b64 s[12:13], exec
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -7081,7 +7080,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB18_4
@@ -8667,6 +8665,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc
; GFX942-NEXT: v_perm_b32 v6, v5, v4, s11
+; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
; GFX942-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
; GFX942-NEXT: ; => This Inner Loop Header: Depth=2
; GFX942-NEXT: v_readfirstlane_b32 s4, v0
@@ -8679,7 +8678,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0
; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB21_4
@@ -8993,6 +8991,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc
; GFX90A-NEXT: v_perm_b32 v6, v5, v4, s15
; GFX90A-NEXT: s_mov_b64 s[12:13], exec
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -9004,7 +9003,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB21_4
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
index acbea3921b616..8ac6353133e72 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
@@ -429,6 +429,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp
; GFX942-NEXT: v_max_f32_e32 v4, v7, v7
; GFX942-NEXT: v_min_f32_e32 v6, v4, v9
; GFX942-NEXT: s_mov_b64 s[8:9], exec
+; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
; GFX942-NEXT: buffer_wbl2 sc1
; GFX942-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1
; GFX942-NEXT: ; => This Inner Loop Header: Depth=2
@@ -442,7 +443,6 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp
; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0
; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB2_4
@@ -549,6 +549,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp
; GFX90A-NEXT: v_max_f32_e32 v4, v7, v7
; GFX90A-NEXT: v_min_f32_e32 v6, v4, v9
; GFX90A-NEXT: s_mov_b64 s[12:13], exec
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -560,7 +561,6 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB2_4
@@ -1653,8 +1653,8 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg
; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v9, v6
-; GFX942-NEXT: v_mov_b32_e32 v8, v5
+; GFX942-NEXT: v_mov_b32_e32 v7, v6
+; GFX942-NEXT: v_mov_b32_e32 v6, v5
; GFX942-NEXT: s_mov_b64 s[2:3], exec
; GFX942-NEXT: buffer_wbl2 sc1
; GFX942-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
@@ -1668,7 +1668,6 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg
; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[8:9]
; GFX942-NEXT: buffer_atomic_min_f64 v[6:7], v4, s[4:7], 0 offen offset:2048 sc0
; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX942-NEXT: ; implicit-def: $vgpr4
@@ -1784,8 +1783,8 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v9, v6
-; GFX90A-NEXT: v_mov_b32_e32 v8, v5
+; GFX90A-NEXT: v_mov_b32_e32 v7, v6
+; GFX90A-NEXT: v_mov_b32_e32 v6, v5
; GFX90A-NEXT: s_mov_b64 s[6:7], exec
; GFX90A-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -1797,7 +1796,6 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1]
; GFX90A-NEXT: buffer_atomic_min_f64 v[6:7], v4, s[8:11], 0 offen offset:2048 glc
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX90A-NEXT: ; implicit-def: $vgpr4
@@ -3605,6 +3603,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX942-NEXT: v_lshlrev_b32_e32 v4, v8, v4
; GFX942-NEXT: v_and_or_b32 v6, v7, v10, v4
; GFX942-NEXT: s_mov_b64 s[8:9], exec
+; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
; GFX942-NEXT: buffer_wbl2 sc1
; GFX942-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1
; GFX942-NEXT: ; => This Inner Loop Header: Depth=2
@@ -3618,7 +3617,6 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0
; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB12_4
@@ -3904,6 +3902,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v8, v4
; GFX90A-NEXT: v_and_or_b32 v6, v7, v10, v4
; GFX90A-NEXT: s_mov_b64 s[12:13], exec
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -3915,7 +3914,6 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB12_4
@@ -5486,6 +5484,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
; GFX942-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX942-NEXT: v_and_or_b32 v6, v7, v10, v4
+; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
; GFX942-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
; GFX942-NEXT: ; => This Inner Loop Header: Depth=2
; GFX942-NEXT: v_readfirstlane_b32 s4, v0
@@ -5498,7 +5497,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0
; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB15_4
@@ -5812,6 +5810,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX90A-NEXT: v_and_or_b32 v6, v7, v10, v4
; GFX90A-NEXT: s_mov_b64 s[12:13], exec
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -5823,7 +5822,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB15_4
@@ -6878,6 +6876,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX942-NEXT: s_mov_b64 s[8:9], exec
; GFX942-NEXT: v_pk_min_f16 v6, v4, v9
; GFX942-NEXT: buffer_wbl2 sc1
+; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
; GFX942-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
; GFX942-NEXT: ; => This Inner Loop Header: Depth=2
; GFX942-NEXT: v_readfirstlane_b32 s4, v0
@@ -6890,7 +6889,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0
; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB18_4
@@ -7070,6 +7068,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX90A-NEXT: v_pk_max_f16 v4, v7, v7
; GFX90A-NEXT: v_pk_min_f16 v6, v4, v9
; GFX90A-NEXT: s_mov_b64 s[12:13], exec
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -7081,7 +7080,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB18_4
@@ -8667,6 +8665,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc
; GFX942-NEXT: v_perm_b32 v6, v5, v4, s11
+; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
; GFX942-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
; GFX942-NEXT: ; => This Inner Loop Header: Depth=2
; GFX942-NEXT: v_readfirstlane_b32 s4, v0
@@ -8679,7 +8678,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0
; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB21_4
@@ -8993,6 +8991,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc
; GFX90A-NEXT: v_perm_b32 v6, v5, v4, s15
; GFX90A-NEXT: s_mov_b64 s[12:13], exec
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -9004,7 +9003,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB21_4
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll
index 0199e2866b35d..3c991cfb7a1aa 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll
@@ -258,59 +258,68 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7)
; SDAG-GFX942-NEXT: .LBB0_1: ; %load-store-loop
; SDAG-GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; SDAG-GFX942-NEXT: s_add_i32 s1, s0, s16
-; SDAG-GFX942-NEXT: v_mov_b32_e32 v0, s1
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v0, s[4:7], 0 offen
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[6:9], v0, s[4:7], 0 offen offset:16
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v0, s[4:7], 0 offen offset:32
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[14:17], v0, s[4:7], 0 offen offset:48
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[18:21], v0, s[4:7], 0 offen offset:64
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[22:25], v0, s[4:7], 0 offen offset:80
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[26:29], v0, s[4:7], 0 offen offset:96
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[30:33], v0, s[4:7], 0 offen offset:112
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[34:37], v0, s[4:7], 0 offen offset:128
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[38:41], v0, s[4:7], 0 offen offset:144
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[42:45], v0, s[4:7], 0 offen offset:160
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[46:49], v0, s[4:7], 0 offen offset:176
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[50:53], v0, s[4:7], 0 offen offset:192
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[54:57], v0, s[4:7], 0 offen offset:208
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[58:61], v0, s[4:7], 0 offen offset:224
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 a[0:3], v0, s[4:7], 0 offen offset:240
-; SDAG-GFX942-NEXT: s_add_i32 s1, s8, s16
+; SDAG-GFX942-NEXT: v_mov_b32_e32 v60, s1
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[8:11], v60, s[4:7], 0 offen
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[4:7], v60, s[4:7], 0 offen offset:16
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[12:15], v60, s[4:7], 0 offen offset:32
+; SDAG-GFX942-NEXT: s_add_i32 s2, s8, s16
+; SDAG-GFX942-NEXT: v_mov_b32_e32 v0, s2
; SDAG-GFX942-NEXT: s_addk_i32 s16, 0x100
-; SDAG-GFX942-NEXT: v_mov_b32_e32 v0, s1
; SDAG-GFX942-NEXT: s_cmpk_lt_u32 s16, 0x2000
+; SDAG-GFX942-NEXT: s_waitcnt vmcnt(0)
+; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a0, v15 ; Reload Reuse
+; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a1, v14 ; Reload Reuse
+; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a2, v13 ; Reload Reuse
+; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a3, v12 ; Reload Reuse
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[12:15], v60, s[4:7], 0 offen offset:48
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[16:19], v60, s[4:7], 0 offen offset:64
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[20:23], v60, s[4:7], 0 offen offset:80
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[24:27], v60, s[4:7], 0 offen offset:96
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[28:31], v60, s[4:7], 0 offen offset:112
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[32:35], v60, s[4:7], 0 offen offset:128
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[36:39], v60, s[4:7], 0 offen offset:144
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[40:43], v60, s[4:7], 0 offen offset:160
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[44:47], v60, s[4:7], 0 offen offset:176
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[48:51], v60, s[4:7], 0 offen offset:192
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[52:55], v60, s[4:7], 0 offen offset:208
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[56:59], v60, s[4:7], 0 offen offset:224
+; SDAG-GFX942-NEXT: s_nop 0
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[60:63], v60, s[4:7], 0 offen offset:240
+; SDAG-GFX942-NEXT: s_nop 0
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[8:11], v0, s[12:15], 0 offen
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[4:7], v0, s[12:15], 0 offen offset:16
+; SDAG-GFX942-NEXT: s_nop 1
+; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v5, a0 ; Reload Reuse
+; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v4, a1 ; Reload Reuse
+; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v3, a2 ; Reload Reuse
+; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v2, a3 ; Reload Reuse
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v0, s[12:15], 0 offen offset:32
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v0, s[12:15], 0 offen
-; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v0, s[12:15], 0 offen offset:16
-; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v0, s[12:15], 0 offen offset:32
-; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v0, s[12:15], 0 offen offset:48
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[12:15], v0, s[12:15], 0 offen offset:48
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v0, s[12:15], 0 offen offset:64
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[16:19], v0, s[12:15], 0 offen offset:64
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v0, s[12:15], 0 offen offset:80
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[20:23], v0, s[12:15], 0 offen offset:80
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v0, s[12:15], 0 offen offset:96
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[24:27], v0, s[12:15], 0 offen offset:96
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[30:33], v0, s[12:15], 0 offen offset:112
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[28:31], v0, s[12:15], 0 offen offset:112
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[34:37], v0, s[12:15], 0 offen offset:128
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[32:35], v0, s[12:15], 0 offen offset:128
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v0, s[12:15], 0 offen offset:144
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[36:39], v0, s[12:15], 0 offen offset:144
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v0, s[12:15], 0 offen offset:160
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[40:43], v0, s[12:15], 0 offen offset:160
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v0, s[12:15], 0 offen offset:176
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[44:47], v0, s[12:15], 0 offen offset:176
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v0, s[12:15], 0 offen offset:192
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[48:51], v0, s[12:15], 0 offen offset:192
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v0, s[12:15], 0 offen offset:208
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[52:55], v0, s[12:15], 0 offen offset:208
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v0, s[12:15], 0 offen offset:224
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[56:59], v0, s[12:15], 0 offen offset:224
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 a[0:3], v0, s[12:15], 0 offen offset:240
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[60:63], v0, s[12:15], 0 offen offset:240
; SDAG-GFX942-NEXT: s_cbranch_scc1 .LBB0_1
; SDAG-GFX942-NEXT: ; %bb.2: ; %memcpy-split
; SDAG-GFX942-NEXT: s_endpgm
@@ -431,58 +440,46 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7)
; GISEL-GFX942-NEXT: v_mov_b32_e32 v1, s16
; GISEL-GFX942-NEXT: .LBB0_1: ; %load-store-loop
; GISEL-GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
-; GISEL-GFX942-NEXT: v_add_u32_e32 v2, s0, v1
-; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[4:7], v2, s[8:11], 0 offen
-; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[8:11], v2, s[8:11], 0 offen offset:16
-; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[12:15], v2, s[8:11], 0 offen offset:32
-; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[16:19], v2, s[8:11], 0 offen offset:48
-; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[20:23], v2, s[8:11], 0 offen offset:64
-; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[24:27], v2, s[8:11], 0 offen offset:80
-; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[28:31], v2, s[8:11], 0 offen offset:96
-; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[32:35], v2, s[8:11], 0 offen offset:112
-; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[36:39], v2, s[8:11], 0 offen offset:128
-; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[40:43], v2, s[8:11], 0 offen offset:144
-; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[44:47], v2, s[8:11], 0 offen offset:160
-; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[48:51], v2, s[8:11], 0 offen offset:176
-; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[52:55], v2, s[8:11], 0 offen offset:192
-; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[56:59], v2, s[8:11], 0 offen offset:208
-; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[60:63], v2, s[8:11], 0 offen offset:224
-; GISEL-GFX942-NEXT: buffer_load_dwordx4 a[0:3], v2, s[8:11], 0 offen offset:240
-; GISEL-GFX942-NEXT: v_add_u32_e32 v2, s12, v1
+; GISEL-GFX942-NEXT: v_add_u32_e32 v62, s0, v1
+; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v62, s[8:11], 0 offen
+; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[6:9], v62, s[8:11], 0 offen offset:16
+; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v62, s[8:11], 0 offen offset:32
+; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[14:17], v62, s[8:11], 0 offen offset:48
+; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[18:21], v62, s[8:11], 0 offen offset:64
+; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[22:25], v62, s[8:11], 0 offen offset:80
+; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[26:29], v62, s[8:11], 0 offen offset:96
+; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[30:33], v62, s[8:11], 0 offen offset:112
+; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[34:37], v62, s[8:11], 0 offen offset:128
+; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[38:41], v62, s[8:11], 0 offen offset:144
+; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[42:45], v62, s[8:11], 0 offen offset:160
+; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[46:49], v62, s[8:11], 0 offen offset:176
+; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[50:53], v62, s[8:11], 0 offen offset:192
+; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[54:57], v62, s[8:11], 0 offen offset:208
+; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[58:61], v62, s[8:11], 0 offen offset:224
+; GISEL-GFX942-NEXT: buffer_load_dwordx4 a[0:3], v62, s[8:11], 0 offen offset:240
+; GISEL-GFX942-NEXT: v_add_u32_e32 v63, s12, v1
; GISEL-GFX942-NEXT: v_add_u32_e32 v1, 0x100, v1
; GISEL-GFX942-NEXT: v_cmp_lt_u32_e32 vcc, v1, v0
-; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[4:7], v2, s[4:7], 0 offen
-; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[8:11], v2, s[4:7], 0 offen offset:16
-; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[12:15], v2, s[4:7], 0 offen offset:32
-; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[16:19], v2, s[4:7], 0 offen offset:48
-; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[20:23], v2, s[4:7], 0 offen offset:64
-; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[24:27], v2, s[4:7], 0 offen offset:80
-; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[28:31], v2, s[4:7], 0 offen offset:96
-; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[32:35], v2, s[4:7], 0 offen offset:112
-; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[36:39], v2, s[4:7], 0 offen offset:128
-; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[40:43], v2, s[4:7], 0 offen offset:144
-; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[44:47], v2, s[4:7], 0 offen offset:160
-; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[48:51], v2, s[4:7], 0 offen offset:176
-; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[52:55], v2, s[4:7], 0 offen offset:192
-; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[56:59], v2, s[4:7], 0 offen offset:208
-; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[60:63], v2, s[4:7], 0 offen offset:224
-; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 a[0:3], v2, s[4:7], 0 offen offset:240
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
+; GISEL-GFX942-NEXT: scratch_store_dwordx4 off, a[0:3], off ; 16-byte Folded Spill
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v63, s[4:7], 0 offen offset:16
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v63, s[4:7], 0 offen offset:32
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v63, s[4:7], 0 offen offset:48
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v63, s[4:7], 0 offen offset:64
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v63, s[4:7], 0 offen offset:80
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v63, s[4:7], 0 offen offset:96
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[30:33], v63, s[4:7], 0 offen offset:112
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[34:37], v63, s[4:7], 0 offen offset:128
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v63, s[4:7], 0 offen offset:144
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v63, s[4:7], 0 offen offset:160
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v63, s[4:7], 0 offen offset:176
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v63, s[4:7], 0 offen offset:192
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v63, s[4:7], 0 offen offset:208
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v63, s[4:7], 0 offen offset:224
+; GISEL-GFX942-NEXT: scratch_load_dwordx4 v[2:5], off, off ; 16-byte Folded Reload
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen offset:240
; GISEL-GFX942-NEXT: s_cbranch_vccnz .LBB0_1
; GISEL-GFX942-NEXT: ; %bb.2: ; %memcpy-split
; GISEL-GFX942-NEXT: s_endpgm
@@ -823,41 +820,30 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[54:57], v1, s[4:7], 0 offen offset:208
; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[58:61], v1, s[4:7], 0 offen offset:224
; SDAG-GFX942-NEXT: buffer_load_dwordx4 a[0:3], v1, s[4:7], 0 offen offset:240
-; SDAG-GFX942-NEXT: v_add_u32_e32 v1, s8, v0
+; SDAG-GFX942-NEXT: v_add_u32_e32 v62, s8, v0
; SDAG-GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0x100, v0
; SDAG-GFX942-NEXT: s_and_b64 vcc, exec, vcc
-; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v1, s[12:15], 0 offen
-; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v1, s[12:15], 0 offen offset:16
-; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v1, s[12:15], 0 offen offset:32
-; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v1, s[12:15], 0 offen offset:48
-; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v1, s[12:15], 0 offen offset:64
-; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v1, s[12:15], 0 offen offset:80
-; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v1, s[12:15], 0 offen offset:96
-; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[30:33], v1, s[12:15], 0 offen offset:112
-; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[34:37], v1, s[12:15], 0 offen offset:128
-; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v1, s[12:15], 0 offen offset:144
-; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v1, s[12:15], 0 offen offset:160
-; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v1, s[12:15], 0 offen offset:176
-; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v1, s[12:15], 0 offen offset:192
-; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v1, s[12:15], 0 offen offset:208
-; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v1, s[12:15], 0 offen offset:224
-; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 a[0:3], v1, s[12:15], 0 offen offset:240
+; SDAG-GFX942-NEXT: s_waitcnt vmcnt(0)
+; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v63, a3 ; Reload Reuse
+; SDAG-GFX942-NEXT: scratch_store_dwordx3 off, a[0:2], off ; 12-byte Folded Spill
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[12:15], 0 offen
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v62, s[12:15], 0 offen offset:16
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v62, s[12:15], 0 offen offset:32
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v62, s[12:15], 0 offen offset:48
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v62, s[12:15], 0 offen offset:64
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v62, s[12:15], 0 offen offset:80
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v62, s[12:15], 0 offen offset:96
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[30:33], v62, s[12:15], 0 offen offset:112
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[34:37], v62, s[12:15], 0 offen offset:128
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v62, s[12:15], 0 offen offset:144
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v62, s[12:15], 0 offen offset:160
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v62, s[12:15], 0 offen offset:176
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v62, s[12:15], 0 offen offset:192
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v62, s[12:15], 0 offen offset:208
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v62, s[12:15], 0 offen offset:224
+; SDAG-GFX942-NEXT: scratch_load_dwordx3 v[2:4], off, off ; 12-byte Folded Reload
+; SDAG-GFX942-NEXT: s_waitcnt vmcnt(0)
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[12:15], 0 offen offset:240
; SDAG-GFX942-NEXT: s_cbranch_vccnz .LBB1_1
; SDAG-GFX942-NEXT: ; %bb.2: ; %memcpy-split
; SDAG-GFX942-NEXT: s_endpgm
@@ -991,43 +977,32 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[54:57], v1, s[8:11], 0 offen offset:208
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[58:61], v1, s[8:11], 0 offen offset:224
; GISEL-GFX942-NEXT: buffer_load_dwordx4 a[0:3], v1, s[8:11], 0 offen offset:240
-; GISEL-GFX942-NEXT: v_add_u32_e32 v1, s12, v0
+; GISEL-GFX942-NEXT: v_add_u32_e32 v62, s12, v0
; GISEL-GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0x100, v0
; GISEL-GFX942-NEXT: s_xor_b64 s[2:3], vcc, -1
; GISEL-GFX942-NEXT: s_xor_b64 s[2:3], s[2:3], -1
; GISEL-GFX942-NEXT: s_and_b64 vcc, s[2:3], exec
-; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v1, s[4:7], 0 offen
-; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v1, s[4:7], 0 offen offset:16
-; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v1, s[4:7], 0 offen offset:32
-; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v1, s[4:7], 0 offen offset:48
-; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v1, s[4:7], 0 offen offset:64
-; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v1, s[4:7], 0 offen offset:80
-; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v1, s[4:7], 0 offen offset:96
-; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[30:33], v1, s[4:7], 0 offen offset:112
-; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[34:37], v1, s[4:7], 0 offen offset:128
-; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v1, s[4:7], 0 offen offset:144
-; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v1, s[4:7], 0 offen offset:160
-; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v1, s[4:7], 0 offen offset:176
-; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v1, s[4:7], 0 offen offset:192
-; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v1, s[4:7], 0 offen offset:208
-; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v1, s[4:7], 0 offen offset:224
-; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 a[0:3], v1, s[4:7], 0 offen offset:240
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
+; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v63, a3 ; Reload Reuse
+; GISEL-GFX942-NEXT: scratch_store_dwordx3 off, a[0:2], off ; 12-byte Folded Spill
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[4:7], 0 offen
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v62, s[4:7], 0 offen offset:16
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v62, s[4:7], 0 offen offset:32
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v62, s[4:7], 0 offen offset:48
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v62, s[4:7], 0 offen offset:64
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v62, s[4:7], 0 offen offset:80
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v62, s[4:7], 0 offen offset:96
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[30:33], v62, s[4:7], 0 offen offset:112
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[34:37], v62, s[4:7], 0 offen offset:128
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v62, s[4:7], 0 offen offset:144
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v62, s[4:7], 0 offen offset:160
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v62, s[4:7], 0 offen offset:176
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v62, s[4:7], 0 offen offset:192
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v62, s[4:7], 0 offen offset:208
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v62, s[4:7], 0 offen offset:224
+; GISEL-GFX942-NEXT: scratch_load_dwordx3 v[2:4], off, off ; 12-byte Folded Reload
+; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[4:7], 0 offen offset:240
; GISEL-GFX942-NEXT: s_cbranch_vccnz .LBB1_1
; GISEL-GFX942-NEXT: ; %bb.2: ; %memcpy-split
; GISEL-GFX942-NEXT: s_endpgm
@@ -1171,8 +1146,8 @@ define amdgpu_kernel void @memcpy_known_small(ptr addrspace(7) %src, ptr addrspa
; SDAG-GFX942-NEXT: s_mov_b32 s2, s1
; SDAG-GFX942-NEXT: s_mov_b32 s3, s12
; SDAG-GFX942-NEXT: s_or_b64 s[8:9], s[2:3], s[12:13]
-; SDAG-GFX942-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v0, s[8:11], 0 offen
+; SDAG-GFX942-NEXT: v_mov_b32_e32 v4, s0
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[0:3], v4, s[8:11], 0 offen
; SDAG-GFX942-NEXT: s_load_dword s13, s[4:5], 0x54
; SDAG-GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x44
; SDAG-GFX942-NEXT: s_mov_b32 s5, s12
@@ -1183,12 +1158,12 @@ define amdgpu_kernel void @memcpy_known_small(ptr addrspace(7) %src, ptr addrspa
; SDAG-GFX942-NEXT: s_mov_b32 s2, s1
; SDAG-GFX942-NEXT: s_mov_b32 s3, s12
; SDAG-GFX942-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
-; SDAG-GFX942-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX942-NEXT: v_mov_b32_e32 v5, s0
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(0)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v1, s[4:7], 0 offen
-; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v0, s[8:11], 0 offen offset:16
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[0:3], v5, s[4:7], 0 offen
+; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:16
; SDAG-GFX942-NEXT: s_waitcnt vmcnt(0)
-; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v1, s[4:7], 0 offen offset:16
+; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[0:3], v5, s[4:7], 0 offen offset:16
; SDAG-GFX942-NEXT: s_endpgm
;
; SDAG-GFX1100-LABEL: memcpy_known_small:
@@ -1242,8 +1217,8 @@ define amdgpu_kernel void @memcpy_known_small(ptr addrspace(7) %src, ptr addrspa
; GISEL-GFX942-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9]
; GISEL-GFX942-NEXT: s_mov_b32 s6, s3
; GISEL-GFX942-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11]
-; GISEL-GFX942-NEXT: v_mov_b32_e32 v0, s0
-; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v0, s[8:11], 0 offen
+; GISEL-GFX942-NEXT: v_mov_b32_e32 v4, s0
+; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[0:3], v4, s[8:11], 0 offen
; GISEL-GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x44
; GISEL-GFX942-NEXT: s_load_dword s13, s[4:5], 0x54
; GISEL-GFX942-NEXT: s_mov_b32 s4, s7
@@ -1254,12 +1229,12 @@ define amdgpu_kernel void @memcpy_known_small(ptr addrspace(7) %src, ptr addrspa
; GISEL-GFX942-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
; GISEL-GFX942-NEXT: s_mov_b32 s6, s3
; GISEL-GFX942-NEXT: s_or_b64 s[6:7], s[6:7], s[12:13]
-; GISEL-GFX942-NEXT: v_mov_b32_e32 v1, s0
+; GISEL-GFX942-NEXT: v_mov_b32_e32 v5, s0
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v1, s[4:7], 0 offen
-; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v0, s[8:11], 0 offen offset:16
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[0:3], v5, s[4:7], 0 offen
+; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:16
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
-; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v1, s[4:7], 0 offen offset:16
+; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[0:3], v5, s[4:7], 0 offen offset:16
; GISEL-GFX942-NEXT: s_endpgm
;
; GISEL-GFX1100-LABEL: memcpy_known_small:
diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2_a_v.ll b/llvm/test/CodeGen/AMDGPU/ds_write2_a_v.ll
index 8b998354b1f4f..683887b0a55f3 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_write2_a_v.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_write2_a_v.ll
@@ -426,126 +426,122 @@ define void @ds_write2_b32_av_av_no_vgprs(ptr addrspace(3) %lds) #0 {
; GCN-LABEL: ds_write2_b32_av_av_no_vgprs:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword a32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword a33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword a34, off, s[0:3], s32 ; 4-byte Folded Spill
+; GCN-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a6, v43 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a7, v44 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a8, v45 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a9, v46 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a10, v47 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a11, v56 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a12, v57 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a13, v58 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a14, v59 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a15, v60 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse
; GCN-NEXT: v_accvgpr_write_b32 a0, v0
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; def a1
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: ;;#ASMSTART
-; GCN-NEXT: ; def a34
+; GCN-NEXT: ; def a2
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; def v[0:31]
; GCN-NEXT: ;;#ASMEND
-; GCN-NEXT: v_accvgpr_write_b32 a33, v31
-; GCN-NEXT: v_accvgpr_write_b32 a32, v30
-; GCN-NEXT: v_accvgpr_write_b32 a31, v29
-; GCN-NEXT: v_accvgpr_write_b32 a30, v28
-; GCN-NEXT: v_accvgpr_write_b32 a29, v27
-; GCN-NEXT: v_accvgpr_write_b32 a28, v26
-; GCN-NEXT: v_accvgpr_write_b32 a27, v25
-; GCN-NEXT: v_accvgpr_write_b32 a26, v24
-; GCN-NEXT: v_accvgpr_write_b32 a25, v23
-; GCN-NEXT: v_accvgpr_write_b32 a24, v22
-; GCN-NEXT: v_accvgpr_write_b32 a23, v21
-; GCN-NEXT: v_accvgpr_write_b32 a22, v20
-; GCN-NEXT: v_accvgpr_write_b32 a21, v19
-; GCN-NEXT: v_accvgpr_write_b32 a20, v18
-; GCN-NEXT: v_accvgpr_write_b32 a19, v17
-; GCN-NEXT: v_accvgpr_write_b32 a18, v16
-; GCN-NEXT: v_accvgpr_write_b32 a17, v15
-; GCN-NEXT: v_accvgpr_write_b32 a16, v14
-; GCN-NEXT: v_accvgpr_write_b32 a15, v13
-; GCN-NEXT: v_accvgpr_write_b32 a14, v12
-; GCN-NEXT: v_accvgpr_write_b32 a13, v11
-; GCN-NEXT: v_accvgpr_write_b32 a12, v10
-; GCN-NEXT: v_accvgpr_write_b32 a11, v9
-; GCN-NEXT: v_accvgpr_write_b32 a10, v8
-; GCN-NEXT: v_accvgpr_write_b32 a9, v7
-; GCN-NEXT: v_accvgpr_write_b32 a8, v6
-; GCN-NEXT: v_accvgpr_write_b32 a7, v5
-; GCN-NEXT: v_accvgpr_write_b32 a6, v4
-; GCN-NEXT: v_accvgpr_write_b32 a5, v3
-; GCN-NEXT: v_accvgpr_write_b32 a4, v2
-; GCN-NEXT: v_accvgpr_write_b32 a3, v1
-; GCN-NEXT: v_accvgpr_write_b32 a2, v0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GCN-NEXT: v_accvgpr_write_b32 a19, v31 ; Reload Reuse
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a34
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
; GCN-NEXT: ds_write2_b32 v0, v1, v2 offset0:10 offset1:24
-; GCN-NEXT: v_accvgpr_read_b32 v0, a2
-; GCN-NEXT: v_accvgpr_read_b32 v1, a3
-; GCN-NEXT: v_accvgpr_read_b32 v2, a4
-; GCN-NEXT: v_accvgpr_read_b32 v3, a5
-; GCN-NEXT: v_accvgpr_read_b32 v4, a6
-; GCN-NEXT: v_accvgpr_read_b32 v5, a7
-; GCN-NEXT: v_accvgpr_read_b32 v6, a8
-; GCN-NEXT: v_accvgpr_read_b32 v7, a9
-; GCN-NEXT: v_accvgpr_read_b32 v8, a10
-; GCN-NEXT: v_accvgpr_read_b32 v9, a11
-; GCN-NEXT: v_accvgpr_read_b32 v10, a12
-; GCN-NEXT: v_accvgpr_read_b32 v11, a13
-; GCN-NEXT: v_accvgpr_read_b32 v12, a14
-; GCN-NEXT: v_accvgpr_read_b32 v13, a15
-; GCN-NEXT: v_accvgpr_read_b32 v14, a16
-; GCN-NEXT: v_accvgpr_read_b32 v15, a17
-; GCN-NEXT: v_accvgpr_read_b32 v16, a18
-; GCN-NEXT: v_accvgpr_read_b32 v17, a19
-; GCN-NEXT: v_accvgpr_read_b32 v18, a20
-; GCN-NEXT: v_accvgpr_read_b32 v19, a21
-; GCN-NEXT: v_accvgpr_read_b32 v20, a22
-; GCN-NEXT: v_accvgpr_read_b32 v21, a23
-; GCN-NEXT: v_accvgpr_read_b32 v22, a24
-; GCN-NEXT: v_accvgpr_read_b32 v23, a25
-; GCN-NEXT: v_accvgpr_read_b32 v24, a26
-; GCN-NEXT: v_accvgpr_read_b32 v25, a27
-; GCN-NEXT: v_accvgpr_read_b32 v26, a28
-; GCN-NEXT: v_accvgpr_read_b32 v27, a29
-; GCN-NEXT: v_accvgpr_read_b32 v28, a30
-; GCN-NEXT: v_accvgpr_read_b32 v29, a31
-; GCN-NEXT: v_accvgpr_read_b32 v30, a32
-; GCN-NEXT: v_accvgpr_read_b32 v31, a33
+; GCN-NEXT: v_accvgpr_write_b32 a31, v19 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a30, v20 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a29, v21 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a28, v22 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a27, v23 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a26, v24 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a25, v25 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a24, v26 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a23, v27 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GCN-NEXT: v_accvgpr_read_b32 v19, a31 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v20, a30 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v21, a29 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v22, a28 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v23, a27 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v24, a26 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v25, a25 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v26, a24 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v27, a23 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v28, a22 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v29, a21 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v30, a20 ; Reload Reuse
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_accvgpr_read_b32 v31, a19 ; Reload Reuse
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; use v[0:31]
; GCN-NEXT: ;;#ASMEND
-; GCN-NEXT: buffer_load_dword a34, off, s[0:3], s32 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword a33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword a32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_read_b32 v63, a18 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v62, a17 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v61, a16 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v60, a15 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v59, a14 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v58, a13 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v57, a12 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v56, a11 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v47, a10 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v46, a9 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v45, a8 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v44, a7 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v43, a6 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v42, a5 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v41, a4 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v40, a3 ; Reload Reuse
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(3) %lds, i32 0, i32 10
%gep.1 = getelementptr inbounds [512 x i32], ptr addrspace(3) %lds, i32 0, i32 24
@@ -980,133 +976,123 @@ define void @ds_write2_b64_av_av_no_vgprs(ptr addrspace(3) %lds) #0 {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_accvgpr_write_b32 a1, v40 ; Reload Reuse
-; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword a32, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword a33, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword a34, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword a35, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword a36, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword a37, off, s[0:3], s32 ; 4-byte Folded Spill
+; GCN-NEXT: v_accvgpr_write_b32 a6, v41 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a7, v42 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a8, v43 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a9, v44 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a10, v45 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a11, v46 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a12, v47 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a13, v56 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a14, v57 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a15, v58 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a16, v59 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a17, v60 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a18, v61 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a19, v62 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a20, v63 ; Reload Reuse
; GCN-NEXT: v_accvgpr_write_b32 a0, v0
; GCN-NEXT: ;;#ASMSTART
-; GCN-NEXT: ; def a[34:35]
+; GCN-NEXT: ; def a[2:3]
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: ;;#ASMSTART
-; GCN-NEXT: ; def a[36:37]
+; GCN-NEXT: ; def a[4:5]
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; def v[0:31]
; GCN-NEXT: ;;#ASMEND
-; GCN-NEXT: v_accvgpr_write_b32 a33, v31
-; GCN-NEXT: v_accvgpr_write_b32 a32, v30
-; GCN-NEXT: v_accvgpr_write_b32 a31, v29
-; GCN-NEXT: v_accvgpr_write_b32 a30, v28
-; GCN-NEXT: v_accvgpr_write_b32 a29, v27
-; GCN-NEXT: v_accvgpr_write_b32 a28, v26
-; GCN-NEXT: v_accvgpr_write_b32 a27, v25
-; GCN-NEXT: v_accvgpr_write_b32 a26, v24
-; GCN-NEXT: v_accvgpr_write_b32 a25, v23
-; GCN-NEXT: v_accvgpr_write_b32 a24, v22
-; GCN-NEXT: v_accvgpr_write_b32 a23, v21
-; GCN-NEXT: v_accvgpr_write_b32 a22, v20
-; GCN-NEXT: v_accvgpr_write_b32 a21, v19
-; GCN-NEXT: v_accvgpr_write_b32 a20, v18
-; GCN-NEXT: v_accvgpr_write_b32 a19, v17
-; GCN-NEXT: v_accvgpr_write_b32 a18, v16
-; GCN-NEXT: v_accvgpr_write_b32 a17, v15
-; GCN-NEXT: v_accvgpr_write_b32 a16, v14
-; GCN-NEXT: v_accvgpr_write_b32 a15, v13
-; GCN-NEXT: v_accvgpr_write_b32 a14, v12
-; GCN-NEXT: v_accvgpr_write_b32 a13, v11
-; GCN-NEXT: v_accvgpr_write_b32 a12, v10
-; GCN-NEXT: v_accvgpr_write_b32 a11, v9
-; GCN-NEXT: v_accvgpr_write_b32 a10, v8
-; GCN-NEXT: v_accvgpr_write_b32 a9, v7
-; GCN-NEXT: v_accvgpr_write_b32 a8, v6
-; GCN-NEXT: v_accvgpr_write_b32 a7, v5
-; GCN-NEXT: v_accvgpr_write_b32 a6, v4
-; GCN-NEXT: v_accvgpr_write_b32 a5, v3
-; GCN-NEXT: v_accvgpr_write_b32 a4, v2
-; GCN-NEXT: v_accvgpr_write_b32 a3, v1
-; GCN-NEXT: v_accvgpr_write_b32 a2, v0
-; GCN-NEXT: v_accvgpr_read_b32 v2, a34
-; GCN-NEXT: v_accvgpr_read_b32 v4, a36
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GCN-NEXT: v_accvgpr_write_b32 a21, v31 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v3, a35
-; GCN-NEXT: v_accvgpr_read_b32 v5, a37
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
; GCN-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset0:10 offset1:24
-; GCN-NEXT: v_accvgpr_read_b32 v0, a2
-; GCN-NEXT: v_accvgpr_read_b32 v1, a3
-; GCN-NEXT: v_accvgpr_read_b32 v2, a4
-; GCN-NEXT: v_accvgpr_read_b32 v3, a5
-; GCN-NEXT: v_accvgpr_read_b32 v4, a6
-; GCN-NEXT: v_accvgpr_read_b32 v5, a7
-; GCN-NEXT: v_accvgpr_read_b32 v6, a8
-; GCN-NEXT: v_accvgpr_read_b32 v7, a9
-; GCN-NEXT: v_accvgpr_read_b32 v8, a10
-; GCN-NEXT: v_accvgpr_read_b32 v9, a11
-; GCN-NEXT: v_accvgpr_read_b32 v10, a12
-; GCN-NEXT: v_accvgpr_read_b32 v11, a13
-; GCN-NEXT: v_accvgpr_read_b32 v12, a14
-; GCN-NEXT: v_accvgpr_read_b32 v13, a15
-; GCN-NEXT: v_accvgpr_read_b32 v14, a16
-; GCN-NEXT: v_accvgpr_read_b32 v15, a17
-; GCN-NEXT: v_accvgpr_read_b32 v16, a18
-; GCN-NEXT: v_accvgpr_read_b32 v17, a19
-; GCN-NEXT: v_accvgpr_read_b32 v18, a20
-; GCN-NEXT: v_accvgpr_read_b32 v19, a21
-; GCN-NEXT: v_accvgpr_read_b32 v20, a22
-; GCN-NEXT: v_accvgpr_read_b32 v21, a23
-; GCN-NEXT: v_accvgpr_read_b32 v22, a24
-; GCN-NEXT: v_accvgpr_read_b32 v23, a25
-; GCN-NEXT: v_accvgpr_read_b32 v24, a26
-; GCN-NEXT: v_accvgpr_read_b32 v25, a27
-; GCN-NEXT: v_accvgpr_read_b32 v26, a28
-; GCN-NEXT: v_accvgpr_read_b32 v27, a29
-; GCN-NEXT: v_accvgpr_read_b32 v28, a30
-; GCN-NEXT: v_accvgpr_read_b32 v29, a31
-; GCN-NEXT: v_accvgpr_read_b32 v30, a32
-; GCN-NEXT: v_accvgpr_read_b32 v31, a33
+; GCN-NEXT: v_accvgpr_write_b32 a31, v21 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a30, v22 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a29, v23 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a28, v24 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a27, v25 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a26, v26 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a25, v27 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a24, v28 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a23, v29 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_write_b32 a22, v30 ; Reload Reuse
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GCN-NEXT: v_accvgpr_read_b32 v21, a31 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v22, a30 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v23, a29 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v24, a28 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v25, a27 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v26, a26 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v27, a25 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v28, a24 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v29, a23 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v30, a22 ; Reload Reuse
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_accvgpr_read_b32 v31, a21 ; Reload Reuse
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; use v[0:31]
; GCN-NEXT: ;;#ASMEND
-; GCN-NEXT: buffer_load_dword a37, off, s[0:3], s32 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword a36, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword a35, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword a34, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword a33, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword a32, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GCN-NEXT: v_accvgpr_read_b32 v63, a20 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v62, a19 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v61, a18 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v60, a17 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v59, a16 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v58, a15 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v57, a14 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v56, a13 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v47, a12 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v46, a11 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v45, a10 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v44, a9 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v43, a8 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v42, a7 ; Reload Reuse
+; GCN-NEXT: v_accvgpr_read_b32 v41, a6 ; Reload Reuse
; GCN-NEXT: v_accvgpr_read_b32 v40, a1 ; Reload Reuse
-; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
%gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(3) %lds, i32 0, i32 10
%gep.1 = getelementptr inbounds [512 x i64], ptr addrspace(3) %lds, i32 0, i32 24
diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
index af817c3ee4eb1..1e7855ccb3642 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
@@ -1012,6 +1012,7 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-SDAG-NEXT: buffer_inv sc1
; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-SDAG-NEXT: s_cbranch_execz .LBB12_2
; GFX950-SDAG-NEXT: .LBB12_4: ; %atomicrmw.private
@@ -1044,6 +1045,7 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-GISEL-NEXT: buffer_inv sc1
; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-GISEL-NEXT: s_cbranch_execz .LBB12_2
; GFX950-GISEL-NEXT: .LBB12_4: ; %atomicrmw.private
@@ -1167,6 +1169,7 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-SDAG-NEXT: buffer_inv sc1
; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-SDAG-NEXT: s_cbranch_execz .LBB13_2
; GFX950-SDAG-NEXT: .LBB13_4: ; %atomicrmw.private
@@ -1203,6 +1206,7 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-GISEL-NEXT: buffer_inv sc1
; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-GISEL-NEXT: s_cbranch_execz .LBB13_2
; GFX950-GISEL-NEXT: .LBB13_4: ; %atomicrmw.private
diff --git a/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll b/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll
index d973f7b71fb6d..57bfd2490f9da 100644
--- a/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll
@@ -19,11 +19,11 @@ define amdgpu_kernel void @half8(ptr addrspace(1) nocapture readonly %0, ptr add
; GFX90A-LABEL: half8:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: global_load_dwordx4 v[2:5], v0, s[0:1]
+; GFX90A-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx4 v0, v[2:5], s[2:3]
+; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX90A-NEXT: s_endpgm
;
; GFX1030-LABEL: half8:
@@ -85,11 +85,11 @@ define amdgpu_kernel void @half6(ptr addrspace(1) nocapture readonly %0, ptr add
; GFX90A-LABEL: half6:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: global_load_dwordx3 v[2:4], v0, s[0:1]
+; GFX90A-NEXT: global_load_dwordx3 v[0:2], v3, s[0:1]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx3 v0, v[2:4], s[2:3]
+; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3]
; GFX90A-NEXT: s_endpgm
;
; GFX1030-LABEL: half6:
diff --git a/llvm/test/CodeGen/AMDGPU/illegal-sgpr-to-vgpr-copy.ll b/llvm/test/CodeGen/AMDGPU/illegal-sgpr-to-vgpr-copy.ll
index 554d4f69ea4a2..597f90c0f4e84 100644
--- a/llvm/test/CodeGen/AMDGPU/illegal-sgpr-to-vgpr-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/illegal-sgpr-to-vgpr-copy.ll
@@ -43,7 +43,8 @@ define amdgpu_kernel void @illegal_vgpr_to_sgpr_copy_v16i32() #0 {
}
; ERR: error: <unknown>:0:0: in function illegal_agpr_to_sgpr_copy_i32 void (): illegal VGPR to SGPR copy
-; GCN: ; illegal copy a1 to s9
+; GCN: v_accvgpr_read_b32 [[COPY1:v[0-9]+]], a1
+; GCN: ; illegal copy [[COPY1]] to s9
define amdgpu_kernel void @illegal_agpr_to_sgpr_copy_i32() #1 {
%agpr = call i32 asm sideeffect "; def $0", "=${a1}"()
call void asm sideeffect "; use $0", "${s9}"(i32 %agpr)
@@ -51,7 +52,9 @@ define amdgpu_kernel void @illegal_agpr_to_sgpr_copy_i32() #1 {
}
; ERR: error: <unknown>:0:0: in function illegal_agpr_to_sgpr_copy_v2i32 void (): illegal VGPR to SGPR copy
-; GCN: ; illegal copy a[0:1] to s[10:11]
+; GCN-DAG: v_accvgpr_read_b32 v[[COPY1L:[0-9]+]], a0
+; GCN-DAG: v_accvgpr_read_b32 v[[COPY1H:[0-9]+]], a1
+; GCN: ; illegal copy v[[[COPY1L]]:[[COPY1H]]] to s[10:11]
define amdgpu_kernel void @illegal_agpr_to_sgpr_copy_v2i32() #1 {
%vgpr = call <2 x i32> asm sideeffect "; def $0", "=${a[0:1]}"()
call void asm sideeffect "; use $0", "${s[10:11]}"(<2 x i32> %vgpr)
diff --git a/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll b/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll
index 364d2f52777d3..b91963f08681c 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll
@@ -49,10 +49,10 @@ define amdgpu_kernel void @barrier_release(<4 x i32> inreg %rsrc,
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x3c
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_barrier
-; GFX90A-NEXT: ds_read_b32 v1, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: ds_read_b32 v0, v0
+; GFX90A-NEXT: v_mov_b32_e32 v1, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX90A-NEXT: global_store_dword v1, v0, s[0:1]
; GFX90A-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: barrier_release:
@@ -72,10 +72,10 @@ define amdgpu_kernel void @barrier_release(<4 x i32> inreg %rsrc,
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: s_barrier
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
-; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0
-; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT: global_store_dword v1, v0, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-LABEL: barrier_release:
@@ -94,10 +94,10 @@ define amdgpu_kernel void @barrier_release(<4 x i32> inreg %rsrc,
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x3c
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: s_barrier
-; GFX942-NEXT: ds_read_b32 v1, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: ds_read_b32 v0, v0
+; GFX942-NEXT: v_mov_b32_e32 v1, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX942-NEXT: global_store_dword v1, v0, s[0:1]
; GFX942-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: barrier_release:
@@ -117,10 +117,10 @@ define amdgpu_kernel void @barrier_release(<4 x i32> inreg %rsrc,
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: s_barrier
; GFX942-TGSPLIT-NEXT: buffer_inv sc0
-; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0
-; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX942-TGSPLIT-NEXT: global_store_dword v1, v0, s[0:1]
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX10WGP-LABEL: barrier_release:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll
index a57b43a81205b..3e96dfe40f745 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll
@@ -37,11 +37,11 @@ entry:
define amdgpu_ps void @ds_read_b96_tr_b6(ptr addrspace(3) %addr, ptr addrspace(1) %use) {
; GFX950-SDAG-LABEL: ds_read_b96_tr_b6:
; GFX950-SDAG: ; %bb.0: ; %entry
-; GFX950-SDAG-NEXT: ds_read_b96_tr_b6 v[4:6], v0 offset:32
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, v1
+; GFX950-SDAG-NEXT: ds_read_b96_tr_b6 v[0:2], v0 offset:32
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT: global_store_dwordx3 v[2:3], v[4:6], off
+; GFX950-SDAG-NEXT: global_store_dwordx3 v[4:5], v[0:2], off
; GFX950-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: ds_read_b96_tr_b6:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
index fb32a83f3cf3c..7959cee49b93f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
@@ -294,17 +294,17 @@ define amdgpu_kernel void @test_iglp_opt_asm_sideeffect(ptr addrspace(3) noalias
; GCN-NEXT: ; iglp_opt mask(0x00000000)
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_add_u32_e32 v1, s0, v0
-; GCN-NEXT: ds_read_b32 v2, v1
+; GCN-NEXT: ds_read_b32 v1, v1
; GCN-NEXT: v_add_u32_e32 v0, s1, v0
-; GCN-NEXT: v_mov_b32_e32 v1, s0
+; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: ds_write_b32 v0, v2
+; GCN-NEXT: ds_write_b32 v0, v1
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ;;#ASMEND
-; GCN-NEXT: ds_read_b32 v1, v1 offset:256
-; GCN-NEXT: v_mov_b32_e32 v0, s1
+; GCN-NEXT: ds_read_b32 v0, v2 offset:256
+; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: ds_write_b32 v0, v1 offset:256
+; GCN-NEXT: ds_write_b32 v1, v0 offset:256
; GCN-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.gfx90a.ll
index efd5df85280e6..49607e320bd0a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.gfx90a.ll
@@ -39,7 +39,9 @@ define amdgpu_ps void @atomic_cmpswap_1d_agpr(<8 x i32> inreg %rsrc, i32 %s) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a1
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: image_atomic_cmpswap a[0:1], v0, s[0:7] dmask:0x3 unorm glc
+; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1
+; GFX90A-NEXT: image_atomic_cmpswap v[2:3], v0, s[0:7] dmask:0x3 unorm glc
; GFX90A-NEXT: s_endpgm
%cmp = call i32 asm "; def $0", "=a"()
%swap = call i32 asm "; def $0", "=a"()
@@ -68,10 +70,14 @@ define amdgpu_ps void @atomic_cmpswap_1d_64_agpr(<8 x i32> inreg %rsrc, i32 %s)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1
+; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def a[2:3]
+; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: image_atomic_cmpswap a[0:3], v0, s[0:7] dmask:0xf unorm glc
+; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1
+; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
+; GFX90A-NEXT: image_atomic_cmpswap v[2:5], v0, s[0:7] dmask:0xf unorm glc
; GFX90A-NEXT: s_endpgm
%cmp = call i64 asm "; def $0", "=a"()
%swap = call i64 asm "; def $0", "=a"()
@@ -86,7 +92,8 @@ define amdgpu_ps void @atomic_swap_1d_agpr_noret(<8 x i32> inreg %rsrc, i32 %s)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: image_atomic_swap a0, v0, s[0:7] dmask:0x1 unorm glc
+; GFX90A-NEXT: v_accvgpr_read_b32 v1, a0
+; GFX90A-NEXT: image_atomic_swap v1, v0, s[0:7] dmask:0x1 unorm glc
; GFX90A-NEXT: s_endpgm
%data = call i32 asm "; def $0", "=a"()
%unused = call i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -99,7 +106,8 @@ define amdgpu_ps void @atomic_add_2d_agpr_noret(<8 x i32> inreg %rsrc, i32 %s, i
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: image_atomic_add a0, v[0:1], s[0:7] dmask:0x1 unorm glc
+; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX90A-NEXT: image_atomic_add v2, v[0:1], s[0:7] dmask:0x1 unorm glc
; GFX90A-NEXT: s_endpgm
%data = call i32 asm "; def $0", "=a"()
%unused = call i32 @llvm.amdgcn.image.atomic.add.2d.i32.i32(i32 %data, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
@@ -115,7 +123,9 @@ define amdgpu_ps void @atomic_cmpswap_1d_agpr_noret(<8 x i32> inreg %rsrc, i32 %
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a1
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: image_atomic_cmpswap a[0:1], v0, s[0:7] dmask:0x3 unorm glc
+; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1
+; GFX90A-NEXT: image_atomic_cmpswap v[2:3], v0, s[0:7] dmask:0x3 unorm glc
; GFX90A-NEXT: s_endpgm
%cmp = call i32 asm "; def $0", "=a"()
%swap = call i32 asm "; def $0", "=a"()
@@ -129,7 +139,9 @@ define amdgpu_ps void @atomic_swap_1d_i64_agpr_noret(<8 x i32> inreg %rsrc, i32
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: image_atomic_swap a[0:1], v0, s[0:7] dmask:0x3 unorm glc
+; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1
+; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX90A-NEXT: image_atomic_swap v[2:3], v0, s[0:7] dmask:0x3 unorm glc
; GFX90A-NEXT: s_endpgm
%data = call i64 asm "; def $0", "=a"()
%unused = call i64 @llvm.amdgcn.image.atomic.swap.1d.i64.i32(i64 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -142,10 +154,14 @@ define amdgpu_ps void @atomic_cmpswap_1d_64_agpr_noret(<8 x i32> inreg %rsrc, i3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1
+; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def a[2:3]
+; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: image_atomic_cmpswap a[0:3], v0, s[0:7] dmask:0xf unorm glc
+; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1
+; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
+; GFX90A-NEXT: image_atomic_cmpswap v[2:5], v0, s[0:7] dmask:0xf unorm glc
; GFX90A-NEXT: s_endpgm
%cmp = call i64 asm "; def $0", "=a"()
%swap = call i64 asm "; def $0", "=a"()
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll
index 92a5f88246888..12a998ad82cd2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll
@@ -89,59 +89,59 @@ define amdgpu_kernel void @test_mfma_f32_32x32x2bf16(ptr addrspace(1) %arg) #0 {
; GFX908-NEXT: v_mfma_f32_32x32x2bf16 a[0:31], v3, v0, a[0:31] cbsz:1 abid:2 blgp:3
; GFX908-NEXT: s_nop 15
; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a28
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a29
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a30
; GFX908-NEXT: v_accvgpr_read_b32 v3, a31
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a28
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:112
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a16
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a17
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a18
; GFX908-NEXT: v_accvgpr_read_b32 v3, a19
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a16
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a20
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a21
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a22
; GFX908-NEXT: v_accvgpr_read_b32 v3, a23
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a20
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a8
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a9
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a10
; GFX908-NEXT: v_accvgpr_read_b32 v3, a11
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a8
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a12
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a13
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
; GFX908-NEXT: v_accvgpr_read_b32 v3, a15
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a12
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35]
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16
; GFX908-NEXT: s_endpgm
@@ -255,25 +255,25 @@ define amdgpu_kernel void @test_mfma_f32_16x16x2bf16(ptr addrspace(1) %arg) #0 {
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_mfma_f32_16x16x2bf16 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
; GFX908-NEXT: s_nop 9
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a12
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a13
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
; GFX908-NEXT: v_accvgpr_read_b32 v3, a15
-; GFX908-NEXT: v_accvgpr_read_b32 v4, a8
-; GFX908-NEXT: v_accvgpr_read_b32 v5, a9
-; GFX908-NEXT: v_accvgpr_read_b32 v6, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a12
; GFX908-NEXT: v_accvgpr_read_b32 v7, a11
-; GFX908-NEXT: v_accvgpr_read_b32 v8, a4
-; GFX908-NEXT: v_accvgpr_read_b32 v9, a5
-; GFX908-NEXT: v_accvgpr_read_b32 v10, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a8
; GFX908-NEXT: v_accvgpr_read_b32 v11, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v10, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v9, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v8, a4
; GFX908-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:48
; GFX908-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:32
; GFX908-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17]
; GFX908-NEXT: s_endpgm
@@ -422,22 +422,22 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16(ptr addrspace(1) %arg) #0 {
; GFX908-NEXT: v_mfma_f32_32x32x4bf16 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
; GFX908-NEXT: s_nop 15
; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a12
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a13
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
; GFX908-NEXT: v_accvgpr_read_b32 v3, a15
-; GFX908-NEXT: v_accvgpr_read_b32 v4, a8
-; GFX908-NEXT: v_accvgpr_read_b32 v5, a9
-; GFX908-NEXT: v_accvgpr_read_b32 v6, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a12
; GFX908-NEXT: v_accvgpr_read_b32 v7, a11
-; GFX908-NEXT: v_accvgpr_read_b32 v8, a4
-; GFX908-NEXT: v_accvgpr_read_b32 v9, a5
-; GFX908-NEXT: v_accvgpr_read_b32 v10, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a8
; GFX908-NEXT: v_accvgpr_read_b32 v11, a7
-; GFX908-NEXT: v_accvgpr_read_b32 v12, a0
-; GFX908-NEXT: v_accvgpr_read_b32 v13, a1
-; GFX908-NEXT: v_accvgpr_read_b32 v14, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v10, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v9, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v8, a4
; GFX908-NEXT: v_accvgpr_read_b32 v15, a3
+; GFX908-NEXT: v_accvgpr_read_b32 v14, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v13, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v12, a0
; GFX908-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:48
; GFX908-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:32
; GFX908-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.form.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.form.ll
index c21d86684e445..87a7c2ef6c95c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.form.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.form.ll
@@ -8,10 +8,10 @@ define <4 x float> @default(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg
; HEURRC-LABEL: default:
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9
; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8
+; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9
+; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10
+; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11
; HEURRC-NEXT: s_nop 1
; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3]
; HEURRC-NEXT: s_nop 7
@@ -34,10 +34,10 @@ define <4 x float> @request_agpr(<8 x half> %arg0, <8 x half> %arg1, <4 x float>
; HEURRC-LABEL: request_agpr:
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9
; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8
+; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9
+; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10
+; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11
; HEURRC-NEXT: s_nop 1
; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3]
; HEURRC-NEXT: s_nop 7
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
index 22bc62acce15d..5ab8706f28f5f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
@@ -726,12 +726,12 @@ define amdgpu_kernel void @test_mfma_f64_4x4x4f64(ptr addrspace(1) %arg, double
; GFX90A-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[4:5], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-VGPR-NEXT: s_nop 1
-; GFX90A-VGPR-NEXT: v_mfma_f64_4x4x4f64 v[0:1], v[2:3], v[4:5], 0
+; GFX90A-VGPR-NEXT: v_mfma_f64_4x4x4f64 v[4:5], v[0:1], v[2:3], 0
; GFX90A-VGPR-NEXT: s_nop 3
-; GFX90A-VGPR-NEXT: v_mfma_f64_4x4x4f64 v[0:1], v[2:3], v[4:5], v[0:1] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: v_mfma_f64_4x4x4f64 v[0:1], v[0:1], v[2:3], v[4:5] cbsz:1 abid:2 blgp:3
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-VGPR-NEXT: s_nop 7
; GFX90A-VGPR-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
@@ -742,12 +742,12 @@ define amdgpu_kernel void @test_mfma_f64_4x4x4f64(ptr addrspace(1) %arg, double
; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[6:7]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX942-VGPR-NEXT: s_nop 1
-; GFX942-VGPR-NEXT: v_mfma_f64_4x4x4_4b_f64 v[0:1], v[2:3], v[4:5], 0
+; GFX942-VGPR-NEXT: v_mfma_f64_4x4x4_4b_f64 v[4:5], v[0:1], v[2:3], 0
; GFX942-VGPR-NEXT: s_nop 3
-; GFX942-VGPR-NEXT: v_mfma_f64_4x4x4_4b_f64 v[0:1], v[2:3], v[4:5], v[0:1] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-VGPR-NEXT: v_mfma_f64_4x4x4_4b_f64 v[0:1], v[0:1], v[2:3], v[4:5] cbsz:1 abid:2 neg:[1,1,0]
; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, 0
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
@@ -765,10 +765,10 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl
; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: v_mov_b32_e32 v2, s10
; GFX90A-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
-; GFX90A-NEXT: v_mov_b32_e32 v1, s11
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v3, s11
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[12:13], s[12:13] op_sel:[0,1]
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0
; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1
@@ -779,7 +779,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl
; GFX90A-NEXT: v_accvgpr_write_b32 a6, s6
; GFX90A-NEXT: v_accvgpr_write_b32 a7, s7
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[2:3], v[0:1], a[0:7] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_nop 15
; GFX90A-NEXT: s_nop 0
@@ -792,10 +792,10 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl
; GFX942-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: v_mov_b32_e32 v2, s10
; GFX942-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
-; GFX942-NEXT: v_mov_b32_e32 v1, s11
-; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[12:13]
+; GFX942-NEXT: v_mov_b32_e32 v3, s11
+; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: v_accvgpr_write_b32 a0, s0
; GFX942-NEXT: v_accvgpr_write_b32 a1, s1
@@ -806,7 +806,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl
; GFX942-NEXT: v_accvgpr_write_b32 a6, s6
; GFX942-NEXT: v_accvgpr_write_b32 a7, s7
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[2:3], v[0:1], a[0:7] cbsz:1 abid:2 neg:[1,1,0]
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_nop 15
; GFX942-NEXT: s_nop 0
@@ -819,17 +819,17 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl
; GFX90A-VGPR-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
; GFX90A-VGPR-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34
; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, s10
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v10, s10
; GFX90A-VGPR-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
-; GFX90A-VGPR-NEXT: v_mov_b32_e32 v9, s11
-; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v11, s11
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], s[12:13], s[12:13] op_sel:[0,1]
; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-VGPR-NEXT: s_nop 1
-; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[10:11], v[8:9], v[0:7] cbsz:1 abid:2 blgp:3
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-VGPR-NEXT: s_nop 15
; GFX90A-VGPR-NEXT: s_nop 0
@@ -842,17 +842,17 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl
; GFX942-VGPR-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
; GFX942-VGPR-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, s10
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v10, s10
; GFX942-VGPR-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, s11
-; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[12:13]
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v11, s11
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[12:13]
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
; GFX942-VGPR-NEXT: s_nop 1
-; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[10:11], v[8:9], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0
; GFX942-VGPR-NEXT: s_nop 15
; GFX942-VGPR-NEXT: s_nop 0
@@ -1629,20 +1629,20 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, 0x3ff00000
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, v0
; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-VGPR-NEXT: v_mov_b32_e32 v10, s2
-; GFX90A-VGPR-NEXT: v_mov_b32_e32 v11, s3
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v12, s2
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v13, s3
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, v0
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, v0
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, v0
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[12:13], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-VGPR-NEXT: s_nop 1
-; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9]
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[12:13], v[10:11], v[2:9]
; GFX90A-VGPR-NEXT: s_nop 15
; GFX90A-VGPR-NEXT: s_nop 1
; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
@@ -1657,20 +1657,20 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d
; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, 0x3ff00000
; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v10, s2
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v11, s3
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v12, s2
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v13, s3
; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0
; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, v0
; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v0
; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0
; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], v[6:7]
-; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[6:7]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[6:7]
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX942-VGPR-NEXT: s_nop 1
-; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9]
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[12:13], v[10:11], v[2:9]
; GFX942-VGPR-NEXT: s_nop 15
; GFX942-VGPR-NEXT: s_nop 1
; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
@@ -1743,20 +1743,20 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v1, 0x405ec000
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, v0
; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-VGPR-NEXT: v_mov_b32_e32 v10, s2
-; GFX90A-VGPR-NEXT: v_mov_b32_e32 v11, s3
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v12, s2
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v13, s3
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, v0
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, v1
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, v0
; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, v1
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[12:13], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-VGPR-NEXT: s_nop 1
-; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9]
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[12:13], v[10:11], v[2:9]
; GFX90A-VGPR-NEXT: s_nop 15
; GFX90A-VGPR-NEXT: s_nop 1
; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
@@ -1771,20 +1771,20 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %
; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 0x405ec000
; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v10, s2
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v11, s3
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v12, s2
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v13, s3
; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v1
; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, v0
; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v1
; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0
; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v1
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], v[6:7]
-; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[6:7]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[6:7]
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX942-VGPR-NEXT: s_nop 1
-; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9]
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[12:13], v[10:11], v[2:9]
; GFX942-VGPR-NEXT: s_nop 15
; GFX942-VGPR-NEXT: s_nop 1
; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll
index bc4822ef32a3d..dc4c929124fec 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll
@@ -1445,20 +1445,20 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_f16(ptr addrspace(1) %arg, <
; GFX942-SDAG: ; %bb.0: ; %bb
; GFX942-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX942-SDAG-NEXT: s_load_dword s6, s[4:5], 0x44
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v6, 0
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX942-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
-; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[12:13]
-; GFX942-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[14:15]
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s6
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v7, s6
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
-; GFX942-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; GFX942-SDAG-NEXT: s_nop 1
-; GFX942-SDAG-NEXT: v_smfmac_f32_16x16x32_f16 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
+; GFX942-SDAG-NEXT: v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2
; GFX942-SDAG-NEXT: s_nop 6
-; GFX942-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[8:9]
+; GFX942-SDAG-NEXT: global_store_dwordx4 v6, v[8:11], s[8:9]
; GFX942-SDAG-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: test_smfmac_f32_16x16x32_f16:
@@ -1467,38 +1467,38 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_f16(ptr addrspace(1) %arg, <
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX942-GISEL-NEXT: s_load_dword s6, s[4:5], 0x44
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[12:13]
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[14:15]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, s6
; GFX942-GISEL-NEXT: s_nop 1
-; GFX942-GISEL-NEXT: v_smfmac_f32_16x16x32_f16 v[6:9], v[10:11], v[2:5], v0 cbsz:1 abid:2
+; GFX942-GISEL-NEXT: v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX942-GISEL-NEXT: s_nop 5
-; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[6:9], s[8:9]
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[8:9]
; GFX942-GISEL-NEXT: s_endpgm
;
; GFX950-SDAG-LABEL: test_smfmac_f32_16x16x32_f16:
; GFX950-SDAG: ; %bb.0: ; %bb
; GFX950-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX950-SDAG-NEXT: s_load_dword s6, s[4:5], 0x44
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v6, 0
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX950-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
-; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[12:13]
-; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[14:15]
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, s6
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v7, s6
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
-; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; GFX950-SDAG-NEXT: s_nop 1
-; GFX950-SDAG-NEXT: v_smfmac_f32_16x16x32_f16 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
+; GFX950-SDAG-NEXT: v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2
; GFX950-SDAG-NEXT: s_nop 7
-; GFX950-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[8:9]
+; GFX950-SDAG-NEXT: global_store_dwordx4 v6, v[8:11], s[8:9]
; GFX950-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: test_smfmac_f32_16x16x32_f16:
@@ -1507,18 +1507,18 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_f16(ptr addrspace(1) %arg, <
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX950-GISEL-NEXT: s_load_dword s6, s[4:5], 0x44
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[12:13]
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[14:15]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, s6
; GFX950-GISEL-NEXT: s_nop 1
-; GFX950-GISEL-NEXT: v_smfmac_f32_16x16x32_f16 v[6:9], v[10:11], v[2:5], v0 cbsz:1 abid:2
+; GFX950-GISEL-NEXT: v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX950-GISEL-NEXT: s_nop 6
-; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[6:9], s[8:9]
+; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[8:9]
; GFX950-GISEL-NEXT: s_endpgm
; GFX942-AGPRCD-LABEL: test_smfmac_f32_16x16x32_f16:
; GFX942-AGPRCD: ; %bb.0: ; %bb
@@ -1577,11 +1577,11 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, <
; GFX942-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24
; GFX942-SDAG-NEXT: s_load_dword s24, s[4:5], 0x44
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[20:21], s[18:19]
; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
-; GFX942-SDAG-NEXT: v_mov_b64_e32 v[18:19], s[20:21]
-; GFX942-SDAG-NEXT: v_mov_b64_e32 v[20:21], s[22:23]
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, s24
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v22, s24
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
@@ -1592,7 +1592,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, <
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-SDAG-NEXT: s_nop 1
-; GFX942-SDAG-NEXT: v_smfmac_f32_32x32x16_f16 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
+; GFX942-SDAG-NEXT: v_smfmac_f32_32x32x16_f16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX942-SDAG-NEXT: s_nop 9
; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
@@ -1606,11 +1606,11 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, <
; GFX942-GISEL-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24
; GFX942-GISEL-NEXT: s_load_dword s24, s[4:5], 0x44
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[18:19]
; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[20:21]
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[22:23]
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, s24
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v22, s24
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
@@ -1621,7 +1621,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, <
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-GISEL-NEXT: s_nop 1
-; GFX942-GISEL-NEXT: v_smfmac_f32_32x32x16_f16 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
+; GFX942-GISEL-NEXT: v_smfmac_f32_32x32x16_f16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 0
; GFX942-GISEL-NEXT: s_nop 9
; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
@@ -1635,11 +1635,11 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, <
; GFX950-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24
; GFX950-SDAG-NEXT: s_load_dword s24, s[4:5], 0x44
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[20:21], s[18:19]
; GFX950-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
-; GFX950-SDAG-NEXT: v_mov_b64_e32 v[18:19], s[20:21]
-; GFX950-SDAG-NEXT: v_mov_b64_e32 v[20:21], s[22:23]
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s24
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, s24
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
@@ -1650,7 +1650,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, <
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX950-SDAG-NEXT: s_nop 1
-; GFX950-SDAG-NEXT: v_smfmac_f32_32x32x16_f16 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
+; GFX950-SDAG-NEXT: v_smfmac_f32_32x32x16_f16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX950-SDAG-NEXT: s_nop 10
; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
@@ -1664,11 +1664,11 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, <
; GFX950-GISEL-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24
; GFX950-GISEL-NEXT: s_load_dword s24, s[4:5], 0x44
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[18:19]
; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[20:21]
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[22:23]
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, s24
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, s24
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
@@ -1679,7 +1679,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, <
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX950-GISEL-NEXT: s_nop 1
-; GFX950-GISEL-NEXT: v_smfmac_f32_32x32x16_f16 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
+; GFX950-GISEL-NEXT: v_smfmac_f32_32x32x16_f16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 0
; GFX950-GISEL-NEXT: s_nop 10
; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
@@ -1847,20 +1847,20 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_bf16(ptr addrspace(1) %arg,
; GFX942-SDAG: ; %bb.0: ; %bb
; GFX942-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX942-SDAG-NEXT: s_load_dword s6, s[4:5], 0x44
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v6, 0
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX942-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
-; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[12:13]
-; GFX942-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[14:15]
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s6
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v7, s6
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
-; GFX942-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; GFX942-SDAG-NEXT: s_nop 1
-; GFX942-SDAG-NEXT: v_smfmac_f32_16x16x32_bf16 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
+; GFX942-SDAG-NEXT: v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2
; GFX942-SDAG-NEXT: s_nop 6
-; GFX942-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[8:9]
+; GFX942-SDAG-NEXT: global_store_dwordx4 v6, v[8:11], s[8:9]
; GFX942-SDAG-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: test_smfmac_f32_16x16x32_bf16:
@@ -1869,38 +1869,38 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_bf16(ptr addrspace(1) %arg,
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX942-GISEL-NEXT: s_load_dword s6, s[4:5], 0x44
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[12:13]
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[14:15]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, s6
; GFX942-GISEL-NEXT: s_nop 1
-; GFX942-GISEL-NEXT: v_smfmac_f32_16x16x32_bf16 v[6:9], v[10:11], v[2:5], v0 cbsz:1 abid:2
+; GFX942-GISEL-NEXT: v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX942-GISEL-NEXT: s_nop 5
-; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[6:9], s[8:9]
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[8:9]
; GFX942-GISEL-NEXT: s_endpgm
;
; GFX950-SDAG-LABEL: test_smfmac_f32_16x16x32_bf16:
; GFX950-SDAG: ; %bb.0: ; %bb
; GFX950-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX950-SDAG-NEXT: s_load_dword s6, s[4:5], 0x44
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v6, 0
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX950-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
-; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[12:13]
-; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[14:15]
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, s6
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v7, s6
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
-; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; GFX950-SDAG-NEXT: s_nop 1
-; GFX950-SDAG-NEXT: v_smfmac_f32_16x16x32_bf16 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
+; GFX950-SDAG-NEXT: v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2
; GFX950-SDAG-NEXT: s_nop 7
-; GFX950-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[8:9]
+; GFX950-SDAG-NEXT: global_store_dwordx4 v6, v[8:11], s[8:9]
; GFX950-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: test_smfmac_f32_16x16x32_bf16:
@@ -1909,18 +1909,18 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_bf16(ptr addrspace(1) %arg,
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX950-GISEL-NEXT: s_load_dword s6, s[4:5], 0x44
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[12:13]
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[14:15]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, s6
; GFX950-GISEL-NEXT: s_nop 1
-; GFX950-GISEL-NEXT: v_smfmac_f32_16x16x32_bf16 v[6:9], v[10:11], v[2:5], v0 cbsz:1 abid:2
+; GFX950-GISEL-NEXT: v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX950-GISEL-NEXT: s_nop 6
-; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[6:9], s[8:9]
+; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[8:9]
; GFX950-GISEL-NEXT: s_endpgm
; GFX942-AGPRCD-LABEL: test_smfmac_f32_16x16x32_bf16:
; GFX942-AGPRCD: ; %bb.0: ; %bb
@@ -1979,11 +1979,11 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg,
; GFX942-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24
; GFX942-SDAG-NEXT: s_load_dword s24, s[4:5], 0x44
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[20:21], s[18:19]
; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
-; GFX942-SDAG-NEXT: v_mov_b64_e32 v[18:19], s[20:21]
-; GFX942-SDAG-NEXT: v_mov_b64_e32 v[20:21], s[22:23]
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, s24
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v22, s24
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
@@ -1994,7 +1994,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg,
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX942-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-SDAG-NEXT: s_nop 1
-; GFX942-SDAG-NEXT: v_smfmac_f32_32x32x16_bf16 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
+; GFX942-SDAG-NEXT: v_smfmac_f32_32x32x16_bf16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX942-SDAG-NEXT: s_nop 9
; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
@@ -2008,11 +2008,11 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg,
; GFX942-GISEL-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24
; GFX942-GISEL-NEXT: s_load_dword s24, s[4:5], 0x44
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[18:19]
; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[20:21]
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[22:23]
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, s24
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v22, s24
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
@@ -2023,7 +2023,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg,
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-GISEL-NEXT: s_nop 1
-; GFX942-GISEL-NEXT: v_smfmac_f32_32x32x16_bf16 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
+; GFX942-GISEL-NEXT: v_smfmac_f32_32x32x16_bf16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 0
; GFX942-GISEL-NEXT: s_nop 9
; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
@@ -2037,11 +2037,11 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg,
; GFX950-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24
; GFX950-SDAG-NEXT: s_load_dword s24, s[4:5], 0x44
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[20:21], s[18:19]
; GFX950-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
-; GFX950-SDAG-NEXT: v_mov_b64_e32 v[18:19], s[20:21]
-; GFX950-SDAG-NEXT: v_mov_b64_e32 v[20:21], s[22:23]
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s24
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, s24
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
@@ -2052,7 +2052,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg,
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX950-SDAG-NEXT: s_nop 1
-; GFX950-SDAG-NEXT: v_smfmac_f32_32x32x16_bf16 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
+; GFX950-SDAG-NEXT: v_smfmac_f32_32x32x16_bf16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX950-SDAG-NEXT: s_nop 10
; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
@@ -2066,11 +2066,11 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg,
; GFX950-GISEL-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24
; GFX950-GISEL-NEXT: s_load_dword s24, s[4:5], 0x44
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[18:19]
; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[20:21]
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[22:23]
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, s24
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, s24
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
@@ -2081,7 +2081,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg,
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX950-GISEL-NEXT: s_nop 1
-; GFX950-GISEL-NEXT: v_smfmac_f32_32x32x16_bf16 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
+; GFX950-GISEL-NEXT: v_smfmac_f32_32x32x16_bf16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 0
; GFX950-GISEL-NEXT: s_nop 10
; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
@@ -2275,21 +2275,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_i8(ptr addrspace(1) %arg, <2
; GFX942-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX942-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[0:1]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX942-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
; GFX942-GISEL-NEXT: s_mov_b32 s4, s2
; GFX942-GISEL-NEXT: s_mov_b32 s5, s3
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[6:7]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[8:9]
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s14
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, s14
; GFX942-GISEL-NEXT: s_nop 1
-; GFX942-GISEL-NEXT: v_smfmac_i32_16x16x64_i8 v[6:9], v[10:11], v[2:5], v0 cbsz:1 abid:2
+; GFX942-GISEL-NEXT: v_smfmac_i32_16x16x64_i8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX942-GISEL-NEXT: s_nop 5
-; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[6:9], s[12:13]
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
; GFX942-GISEL-NEXT: s_endpgm
;
; GFX950-SDAG-LABEL: test_smfmac_i32_16x16x64_i8:
@@ -2322,21 +2322,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_i8(ptr addrspace(1) %arg, <2
; GFX950-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX950-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[0:1]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX950-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
; GFX950-GISEL-NEXT: s_mov_b32 s4, s2
; GFX950-GISEL-NEXT: s_mov_b32 s5, s3
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[6:7]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[8:9]
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, s14
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, s14
; GFX950-GISEL-NEXT: s_nop 1
-; GFX950-GISEL-NEXT: v_smfmac_i32_16x16x64_i8 v[6:9], v[10:11], v[2:5], v0 cbsz:1 abid:2
+; GFX950-GISEL-NEXT: v_smfmac_i32_16x16x64_i8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX950-GISEL-NEXT: s_nop 6
-; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[6:9], s[12:13]
+; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
; GFX950-GISEL-NEXT: s_endpgm
; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_i8:
; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
@@ -2495,15 +2495,15 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2
; GFX942-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c
; GFX942-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
; GFX942-GISEL-NEXT: s_mov_b32 s20, s18
; GFX942-GISEL-NEXT: s_mov_b32 s21, s19
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[20:21]
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, s26
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v22, s26
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[22:23]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -2512,7 +2512,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-GISEL-NEXT: s_nop 1
-; GFX942-GISEL-NEXT: v_smfmac_i32_32x32x32_i8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
+; GFX942-GISEL-NEXT: v_smfmac_i32_32x32x32_i8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 0
; GFX942-GISEL-NEXT: s_nop 9
; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25]
@@ -2560,15 +2560,15 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2
; GFX950-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c
; GFX950-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
; GFX950-GISEL-NEXT: s_mov_b32 s20, s18
; GFX950-GISEL-NEXT: s_mov_b32 s21, s19
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[20:21]
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, s26
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, s26
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[22:23]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -2577,7 +2577,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX950-GISEL-NEXT: s_nop 1
-; GFX950-GISEL-NEXT: v_smfmac_i32_32x32x32_i8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
+; GFX950-GISEL-NEXT: v_smfmac_i32_32x32x32_i8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 0
; GFX950-GISEL-NEXT: s_nop 10
; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25]
@@ -2789,21 +2789,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_bf8(ptr addrspace(1) %ar
; GFX942-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX942-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[0:1]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX942-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
; GFX942-GISEL-NEXT: s_mov_b32 s4, s2
; GFX942-GISEL-NEXT: s_mov_b32 s5, s3
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[6:7]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[8:9]
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s14
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, s14
; GFX942-GISEL-NEXT: s_nop 1
-; GFX942-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[6:9], v[10:11], v[2:5], v0 cbsz:1 abid:2
+; GFX942-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX942-GISEL-NEXT: s_nop 5
-; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[6:9], s[12:13]
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
; GFX942-GISEL-NEXT: s_endpgm
;
; GFX950-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_bf8:
@@ -2836,21 +2836,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_bf8(ptr addrspace(1) %ar
; GFX950-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX950-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[0:1]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX950-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
; GFX950-GISEL-NEXT: s_mov_b32 s4, s2
; GFX950-GISEL-NEXT: s_mov_b32 s5, s3
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[6:7]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[8:9]
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, s14
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, s14
; GFX950-GISEL-NEXT: s_nop 1
-; GFX950-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[6:9], v[10:11], v[2:5], v0 cbsz:1 abid:2
+; GFX950-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX950-GISEL-NEXT: s_nop 6
-; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[6:9], s[12:13]
+; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
; GFX950-GISEL-NEXT: s_endpgm
; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_bf8:
; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
@@ -3000,21 +3000,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_fp8(ptr addrspace(1) %ar
; GFX942-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX942-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[0:1]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX942-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
; GFX942-GISEL-NEXT: s_mov_b32 s4, s2
; GFX942-GISEL-NEXT: s_mov_b32 s5, s3
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[6:7]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[8:9]
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s14
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, s14
; GFX942-GISEL-NEXT: s_nop 1
-; GFX942-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[6:9], v[10:11], v[2:5], v0 cbsz:1 abid:2
+; GFX942-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX942-GISEL-NEXT: s_nop 5
-; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[6:9], s[12:13]
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
; GFX942-GISEL-NEXT: s_endpgm
;
; GFX950-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_fp8:
@@ -3047,21 +3047,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_fp8(ptr addrspace(1) %ar
; GFX950-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX950-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[0:1]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX950-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
; GFX950-GISEL-NEXT: s_mov_b32 s4, s2
; GFX950-GISEL-NEXT: s_mov_b32 s5, s3
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[6:7]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[8:9]
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, s14
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, s14
; GFX950-GISEL-NEXT: s_nop 1
-; GFX950-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[6:9], v[10:11], v[2:5], v0 cbsz:1 abid:2
+; GFX950-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX950-GISEL-NEXT: s_nop 6
-; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[6:9], s[12:13]
+; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
; GFX950-GISEL-NEXT: s_endpgm
; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_fp8:
; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
@@ -3211,21 +3211,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_bf8(ptr addrspace(1) %ar
; GFX942-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX942-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[0:1]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX942-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
; GFX942-GISEL-NEXT: s_mov_b32 s4, s2
; GFX942-GISEL-NEXT: s_mov_b32 s5, s3
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[6:7]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[8:9]
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s14
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, s14
; GFX942-GISEL-NEXT: s_nop 1
-; GFX942-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[6:9], v[10:11], v[2:5], v0 cbsz:1 abid:2
+; GFX942-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX942-GISEL-NEXT: s_nop 5
-; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[6:9], s[12:13]
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
; GFX942-GISEL-NEXT: s_endpgm
;
; GFX950-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_bf8:
@@ -3258,21 +3258,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_bf8(ptr addrspace(1) %ar
; GFX950-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX950-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[0:1]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX950-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
; GFX950-GISEL-NEXT: s_mov_b32 s4, s2
; GFX950-GISEL-NEXT: s_mov_b32 s5, s3
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[6:7]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[8:9]
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, s14
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, s14
; GFX950-GISEL-NEXT: s_nop 1
-; GFX950-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[6:9], v[10:11], v[2:5], v0 cbsz:1 abid:2
+; GFX950-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX950-GISEL-NEXT: s_nop 6
-; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[6:9], s[12:13]
+; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
; GFX950-GISEL-NEXT: s_endpgm
; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_bf8:
; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
@@ -3422,21 +3422,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_fp8(ptr addrspace(1) %ar
; GFX942-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX942-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[0:1]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX942-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
; GFX942-GISEL-NEXT: s_mov_b32 s4, s2
; GFX942-GISEL-NEXT: s_mov_b32 s5, s3
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[6:7]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[8:9]
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s14
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, s14
; GFX942-GISEL-NEXT: s_nop 1
-; GFX942-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[6:9], v[10:11], v[2:5], v0 cbsz:1 abid:2
+; GFX942-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX942-GISEL-NEXT: s_nop 5
-; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[6:9], s[12:13]
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
; GFX942-GISEL-NEXT: s_endpgm
;
; GFX950-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_fp8:
@@ -3469,21 +3469,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_fp8(ptr addrspace(1) %ar
; GFX950-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX950-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[0:1]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX950-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
; GFX950-GISEL-NEXT: s_mov_b32 s4, s2
; GFX950-GISEL-NEXT: s_mov_b32 s5, s3
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[6:7]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[8:9]
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, s14
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, s14
; GFX950-GISEL-NEXT: s_nop 1
-; GFX950-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[6:9], v[10:11], v[2:5], v0 cbsz:1 abid:2
+; GFX950-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX950-GISEL-NEXT: s_nop 6
-; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[6:9], s[12:13]
+; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
; GFX950-GISEL-NEXT: s_endpgm
; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_fp8:
; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
@@ -3642,15 +3642,15 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar
; GFX942-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c
; GFX942-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
; GFX942-GISEL-NEXT: s_mov_b32 s20, s18
; GFX942-GISEL-NEXT: s_mov_b32 s21, s19
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[20:21]
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, s26
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v22, s26
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[22:23]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -3659,7 +3659,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-GISEL-NEXT: s_nop 1
-; GFX942-GISEL-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
+; GFX942-GISEL-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 0
; GFX942-GISEL-NEXT: s_nop 9
; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25]
@@ -3707,15 +3707,15 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar
; GFX950-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c
; GFX950-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
; GFX950-GISEL-NEXT: s_mov_b32 s20, s18
; GFX950-GISEL-NEXT: s_mov_b32 s21, s19
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[20:21]
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, s26
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, s26
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[22:23]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -3724,7 +3724,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX950-GISEL-NEXT: s_nop 1
-; GFX950-GISEL-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
+; GFX950-GISEL-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 0
; GFX950-GISEL-NEXT: s_nop 10
; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25]
@@ -3945,15 +3945,15 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar
; GFX942-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c
; GFX942-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
; GFX942-GISEL-NEXT: s_mov_b32 s20, s18
; GFX942-GISEL-NEXT: s_mov_b32 s21, s19
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[20:21]
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, s26
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v22, s26
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[22:23]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -3962,7 +3962,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-GISEL-NEXT: s_nop 1
-; GFX942-GISEL-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
+; GFX942-GISEL-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 0
; GFX942-GISEL-NEXT: s_nop 9
; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25]
@@ -4010,15 +4010,15 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar
; GFX950-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c
; GFX950-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
; GFX950-GISEL-NEXT: s_mov_b32 s20, s18
; GFX950-GISEL-NEXT: s_mov_b32 s21, s19
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[20:21]
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, s26
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, s26
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[22:23]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -4027,7 +4027,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX950-GISEL-NEXT: s_nop 1
-; GFX950-GISEL-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
+; GFX950-GISEL-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 0
; GFX950-GISEL-NEXT: s_nop 10
; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25]
@@ -4248,15 +4248,15 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar
; GFX942-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c
; GFX942-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
; GFX942-GISEL-NEXT: s_mov_b32 s20, s18
; GFX942-GISEL-NEXT: s_mov_b32 s21, s19
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[20:21]
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, s26
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v22, s26
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[22:23]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -4265,7 +4265,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-GISEL-NEXT: s_nop 1
-; GFX942-GISEL-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
+; GFX942-GISEL-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 0
; GFX942-GISEL-NEXT: s_nop 9
; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25]
@@ -4313,15 +4313,15 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar
; GFX950-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c
; GFX950-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
; GFX950-GISEL-NEXT: s_mov_b32 s20, s18
; GFX950-GISEL-NEXT: s_mov_b32 s21, s19
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[20:21]
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, s26
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, s26
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[22:23]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -4330,7 +4330,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX950-GISEL-NEXT: s_nop 1
-; GFX950-GISEL-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
+; GFX950-GISEL-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 0
; GFX950-GISEL-NEXT: s_nop 10
; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25]
@@ -4551,15 +4551,15 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar
; GFX942-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c
; GFX942-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
; GFX942-GISEL-NEXT: s_mov_b32 s20, s18
; GFX942-GISEL-NEXT: s_mov_b32 s21, s19
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[20:21]
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, s26
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v22, s26
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[22:23]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -4568,7 +4568,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-GISEL-NEXT: s_nop 1
-; GFX942-GISEL-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
+; GFX942-GISEL-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 0
; GFX942-GISEL-NEXT: s_nop 9
; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25]
@@ -4616,15 +4616,15 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar
; GFX950-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c
; GFX950-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
; GFX950-GISEL-NEXT: s_mov_b32 s20, s18
; GFX950-GISEL-NEXT: s_mov_b32 s21, s19
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[20:21]
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, s26
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, s26
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[22:23]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -4633,7 +4633,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX950-GISEL-NEXT: s_nop 1
-; GFX950-GISEL-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
+; GFX950-GISEL-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 0
; GFX950-GISEL-NEXT: s_nop 10
; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
index 68e3afe8b449a..033a35f69a0bd 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
@@ -15,15 +15,15 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16(<8 x bfloat> %arg0, <8 x
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; GCN-NEXT: v_mov_b64_e32 v[0:1], 48
-; GCN-NEXT: v_mov_b64_e32 v[2:3], 32
-; GCN-NEXT: v_mov_b64_e32 v[4:5], 16
+; GCN-NEXT: v_mov_b64_e32 v[8:9], 48
+; GCN-NEXT: v_mov_b64_e32 v[10:11], 32
+; GCN-NEXT: v_mov_b64_e32 v[12:13], 16
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b64_e32 v[8:9], s[24:25]
-; GCN-NEXT: v_mov_b64_e32 v[10:11], s[26:27]
-; GCN-NEXT: v_mov_b64_e32 v[12:13], s[28:29]
+; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
+; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
+; GCN-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
; GCN-NEXT: v_accvgpr_write_b32 a0, s8
-; GCN-NEXT: v_mov_b64_e32 v[14:15], s[30:31]
+; GCN-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
; GCN-NEXT: v_accvgpr_write_b32 a1, s9
; GCN-NEXT: v_accvgpr_write_b32 a2, s10
; GCN-NEXT: v_accvgpr_write_b32 a3, s11
@@ -41,39 +41,40 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16(<8 x bfloat> %arg0, <8 x
; GCN-NEXT: v_accvgpr_write_b32 a15, s23
; GCN-NEXT: v_mov_b32_e32 v16, s16
; GCN-NEXT: v_mov_b32_e32 v17, s17
-; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[16:31], v[8:11], v[12:15], a[0:15]
+; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[16:31], v[0:3], v[4:7], a[0:15]
; GCN-NEXT: v_mov_b32_e32 v18, s18
; GCN-NEXT: v_mov_b32_e32 v19, s19
-; GCN-NEXT: v_mov_b32_e32 v8, s20
-; GCN-NEXT: v_mov_b32_e32 v9, s21
-; GCN-NEXT: v_mov_b32_e32 v10, s22
-; GCN-NEXT: v_mov_b32_e32 v11, s23
-; GCN-NEXT: v_mov_b64_e32 v[6:7], 0
+; GCN-NEXT: v_mov_b32_e32 v0, s20
+; GCN-NEXT: v_mov_b32_e32 v1, s21
+; GCN-NEXT: v_mov_b32_e32 v2, s22
+; GCN-NEXT: v_mov_b32_e32 v3, s23
+; GCN-NEXT: v_mov_b64_e32 v[14:15], 0
; GCN-NEXT: s_nop 4
-; GCN-NEXT: global_store_dwordx4 v[0:1], a[28:31], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v[2:3], a[24:27], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v[4:5], a[20:23], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v[6:7], a[16:19], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v[2:3], v[16:19], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v[0:1], v[8:11], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v0, s8
; GCN-NEXT: v_mov_b32_e32 v1, s9
; GCN-NEXT: v_mov_b32_e32 v2, s10
; GCN-NEXT: v_mov_b32_e32 v3, s11
-; GCN-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v0, s12
; GCN-NEXT: v_mov_b32_e32 v1, s13
; GCN-NEXT: v_mov_b32_e32 v2, s14
; GCN-NEXT: v_mov_b32_e32 v3, s15
-; GCN-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0)
@@ -87,15 +88,15 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__flags(<8 x bfloat> %arg0
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; GCN-NEXT: v_mov_b64_e32 v[0:1], 48
-; GCN-NEXT: v_mov_b64_e32 v[2:3], 32
-; GCN-NEXT: v_mov_b64_e32 v[4:5], 16
+; GCN-NEXT: v_mov_b64_e32 v[8:9], 48
+; GCN-NEXT: v_mov_b64_e32 v[10:11], 32
+; GCN-NEXT: v_mov_b64_e32 v[12:13], 16
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b64_e32 v[8:9], s[24:25]
-; GCN-NEXT: v_mov_b64_e32 v[10:11], s[26:27]
-; GCN-NEXT: v_mov_b64_e32 v[12:13], s[28:29]
+; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
+; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
+; GCN-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
; GCN-NEXT: v_accvgpr_write_b32 a0, s8
-; GCN-NEXT: v_mov_b64_e32 v[14:15], s[30:31]
+; GCN-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
; GCN-NEXT: v_accvgpr_write_b32 a1, s9
; GCN-NEXT: v_accvgpr_write_b32 a2, s10
; GCN-NEXT: v_accvgpr_write_b32 a3, s11
@@ -113,39 +114,40 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__flags(<8 x bfloat> %arg0
; GCN-NEXT: v_accvgpr_write_b32 a15, s23
; GCN-NEXT: v_mov_b32_e32 v16, s16
; GCN-NEXT: v_mov_b32_e32 v17, s17
-; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[16:31], v[8:11], v[12:15], a[0:15] cbsz:2 abid:3 blgp:1
+; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1
; GCN-NEXT: v_mov_b32_e32 v18, s18
; GCN-NEXT: v_mov_b32_e32 v19, s19
-; GCN-NEXT: v_mov_b32_e32 v8, s20
-; GCN-NEXT: v_mov_b32_e32 v9, s21
-; GCN-NEXT: v_mov_b32_e32 v10, s22
-; GCN-NEXT: v_mov_b32_e32 v11, s23
-; GCN-NEXT: v_mov_b64_e32 v[6:7], 0
+; GCN-NEXT: v_mov_b32_e32 v0, s20
+; GCN-NEXT: v_mov_b32_e32 v1, s21
+; GCN-NEXT: v_mov_b32_e32 v2, s22
+; GCN-NEXT: v_mov_b32_e32 v3, s23
+; GCN-NEXT: v_mov_b64_e32 v[14:15], 0
; GCN-NEXT: s_nop 4
-; GCN-NEXT: global_store_dwordx4 v[0:1], a[28:31], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v[2:3], a[24:27], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v[4:5], a[20:23], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v[6:7], a[16:19], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v[2:3], v[16:19], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v[0:1], v[8:11], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v0, s8
; GCN-NEXT: v_mov_b32_e32 v1, s9
; GCN-NEXT: v_mov_b32_e32 v2, s10
; GCN-NEXT: v_mov_b32_e32 v3, s11
-; GCN-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v0, s12
; GCN-NEXT: v_mov_b32_e32 v1, s13
; GCN-NEXT: v_mov_b32_e32 v2, s14
; GCN-NEXT: v_mov_b32_e32 v3, s15
-; GCN-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 2, i32 3, i32 1)
@@ -158,22 +160,22 @@ define <16 x float> @test_mfma_f32_32x32x16_bf16__mac(<8 x bfloat> %arg0, <8 x b
; GCN-LABEL: test_mfma_f32_32x32x16_bf16__mac:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a15, v23
-; GCN-NEXT: v_accvgpr_write_b32 a14, v22
-; GCN-NEXT: v_accvgpr_write_b32 a13, v21
-; GCN-NEXT: v_accvgpr_write_b32 a12, v20
-; GCN-NEXT: v_accvgpr_write_b32 a11, v19
-; GCN-NEXT: v_accvgpr_write_b32 a10, v18
-; GCN-NEXT: v_accvgpr_write_b32 a9, v17
-; GCN-NEXT: v_accvgpr_write_b32 a8, v16
-; GCN-NEXT: v_accvgpr_write_b32 a7, v15
-; GCN-NEXT: v_accvgpr_write_b32 a6, v14
-; GCN-NEXT: v_accvgpr_write_b32 a5, v13
-; GCN-NEXT: v_accvgpr_write_b32 a4, v12
-; GCN-NEXT: v_accvgpr_write_b32 a3, v11
-; GCN-NEXT: v_accvgpr_write_b32 a2, v10
-; GCN-NEXT: v_accvgpr_write_b32 a1, v9
; GCN-NEXT: v_accvgpr_write_b32 a0, v8
+; GCN-NEXT: v_accvgpr_write_b32 a1, v9
+; GCN-NEXT: v_accvgpr_write_b32 a2, v10
+; GCN-NEXT: v_accvgpr_write_b32 a3, v11
+; GCN-NEXT: v_accvgpr_write_b32 a4, v12
+; GCN-NEXT: v_accvgpr_write_b32 a5, v13
+; GCN-NEXT: v_accvgpr_write_b32 a6, v14
+; GCN-NEXT: v_accvgpr_write_b32 a7, v15
+; GCN-NEXT: v_accvgpr_write_b32 a8, v16
+; GCN-NEXT: v_accvgpr_write_b32 a9, v17
+; GCN-NEXT: v_accvgpr_write_b32 a10, v18
+; GCN-NEXT: v_accvgpr_write_b32 a11, v19
+; GCN-NEXT: v_accvgpr_write_b32 a12, v20
+; GCN-NEXT: v_accvgpr_write_b32 a13, v21
+; GCN-NEXT: v_accvgpr_write_b32 a14, v22
+; GCN-NEXT: v_accvgpr_write_b32 a15, v23
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15]
; GCN-NEXT: s_nop 11
@@ -202,22 +204,22 @@ define <16 x float> @test_mfma_f32_32x32x16_bf16__mac__flags(<8 x bfloat> %arg0,
; GCN-LABEL: test_mfma_f32_32x32x16_bf16__mac__flags:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a15, v23
-; GCN-NEXT: v_accvgpr_write_b32 a14, v22
-; GCN-NEXT: v_accvgpr_write_b32 a13, v21
-; GCN-NEXT: v_accvgpr_write_b32 a12, v20
-; GCN-NEXT: v_accvgpr_write_b32 a11, v19
-; GCN-NEXT: v_accvgpr_write_b32 a10, v18
-; GCN-NEXT: v_accvgpr_write_b32 a9, v17
-; GCN-NEXT: v_accvgpr_write_b32 a8, v16
-; GCN-NEXT: v_accvgpr_write_b32 a7, v15
-; GCN-NEXT: v_accvgpr_write_b32 a6, v14
-; GCN-NEXT: v_accvgpr_write_b32 a5, v13
-; GCN-NEXT: v_accvgpr_write_b32 a4, v12
-; GCN-NEXT: v_accvgpr_write_b32 a3, v11
-; GCN-NEXT: v_accvgpr_write_b32 a2, v10
-; GCN-NEXT: v_accvgpr_write_b32 a1, v9
; GCN-NEXT: v_accvgpr_write_b32 a0, v8
+; GCN-NEXT: v_accvgpr_write_b32 a1, v9
+; GCN-NEXT: v_accvgpr_write_b32 a2, v10
+; GCN-NEXT: v_accvgpr_write_b32 a3, v11
+; GCN-NEXT: v_accvgpr_write_b32 a4, v12
+; GCN-NEXT: v_accvgpr_write_b32 a5, v13
+; GCN-NEXT: v_accvgpr_write_b32 a6, v14
+; GCN-NEXT: v_accvgpr_write_b32 a7, v15
+; GCN-NEXT: v_accvgpr_write_b32 a8, v16
+; GCN-NEXT: v_accvgpr_write_b32 a9, v17
+; GCN-NEXT: v_accvgpr_write_b32 a10, v18
+; GCN-NEXT: v_accvgpr_write_b32 a11, v19
+; GCN-NEXT: v_accvgpr_write_b32 a12, v20
+; GCN-NEXT: v_accvgpr_write_b32 a13, v21
+; GCN-NEXT: v_accvgpr_write_b32 a14, v22
+; GCN-NEXT: v_accvgpr_write_b32 a15, v23
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1
; GCN-NEXT: s_nop 11
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
index 03bf33e0d17e6..753206206180a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
@@ -12,45 +12,29 @@ declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half>, <8 x half>,
; --------------------------------------------------------------------
define <4 x float> @test_mfma_f32_16x16x32_f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2) {
-; SDAG-LABEL: test_mfma_f32_16x16x32_f16:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3]
-; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_f32_16x16x32_f16:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v8
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v9
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v11
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3]
-; GISEL-NEXT: s_nop 7
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_f32_16x16x32_f16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v8
+; GCN-NEXT: v_accvgpr_write_b32 a1, v9
+; GCN-NEXT: v_accvgpr_write_b32 a2, v10
+; GCN-NEXT: v_accvgpr_write_b32 a3, v11
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3]
+; GCN-NEXT: s_nop 7
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
;
; HEURRC-LABEL: test_mfma_f32_16x16x32_f16:
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9
; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8
+; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9
+; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10
+; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11
; HEURRC-NEXT: s_nop 1
; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3]
; HEURRC-NEXT: s_nop 7
@@ -90,45 +74,29 @@ define <4 x float> @test_mfma_f32_16x16x32_f16(<8 x half> %arg0, <8 x half> %arg
}
define <4 x float> @test_mfma_f32_16x16x32_f16__flags(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2) {
-; SDAG-LABEL: test_mfma_f32_16x16x32_f16__flags:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
-; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_f32_16x16x32_f16__flags:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v8
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v9
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v11
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
-; GISEL-NEXT: s_nop 7
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_f32_16x16x32_f16__flags:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v8
+; GCN-NEXT: v_accvgpr_write_b32 a1, v9
+; GCN-NEXT: v_accvgpr_write_b32 a2, v10
+; GCN-NEXT: v_accvgpr_write_b32 a3, v11
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
+; GCN-NEXT: s_nop 7
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
;
; HEURRC-LABEL: test_mfma_f32_16x16x32_f16__flags:
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9
; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8
+; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9
+; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10
+; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11
; HEURRC-NEXT: s_nop 1
; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
; HEURRC-NEXT: s_nop 7
@@ -414,15 +382,15 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48
-; SDAG-NEXT: v_mov_b64_e32 v[2:3], 32
-; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], 48
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], 32
+; SDAG-NEXT: v_mov_b64_e32 v[12:13], 16
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[24:25]
-; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[26:27]
-; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[28:29]
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
-; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[30:31]
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
@@ -440,39 +408,40 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
; SDAG-NEXT: v_mov_b32_e32 v16, s16
; SDAG-NEXT: v_mov_b32_e32 v17, s17
-; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[8:11], v[12:15], a[0:15]
+; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15]
; SDAG-NEXT: v_mov_b32_e32 v18, s18
; SDAG-NEXT: v_mov_b32_e32 v19, s19
-; SDAG-NEXT: v_mov_b32_e32 v8, s20
-; SDAG-NEXT: v_mov_b32_e32 v9, s21
-; SDAG-NEXT: v_mov_b32_e32 v10, s22
-; SDAG-NEXT: v_mov_b32_e32 v11, s23
-; SDAG-NEXT: v_mov_b64_e32 v[6:7], 0
+; SDAG-NEXT: v_mov_b32_e32 v0, s20
+; SDAG-NEXT: v_mov_b32_e32 v1, s21
+; SDAG-NEXT: v_mov_b32_e32 v2, s22
+; SDAG-NEXT: v_mov_b32_e32 v3, s23
+; SDAG-NEXT: v_mov_b64_e32 v[14:15], 0
; SDAG-NEXT: s_nop 4
-; SDAG-NEXT: global_store_dwordx4 v[0:1], a[28:31], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[2:3], a[24:27], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[4:5], a[20:23], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[6:7], a[16:19], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[2:3], v[16:19], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[0:1], v[8:11], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mov_b32_e32 v0, s8
; SDAG-NEXT: v_mov_b32_e32 v1, s9
; SDAG-NEXT: v_mov_b32_e32 v2, s10
; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mov_b32_e32 v0, s12
; SDAG-NEXT: v_mov_b32_e32 v1, s13
; SDAG-NEXT: v_mov_b32_e32 v2, s14
; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -480,15 +449,15 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
; GISEL: ; %bb.0:
; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], 0
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], 16
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], 32
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], 0
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], 16
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], 32
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[24:25]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[26:27]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[28:29]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[30:31]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
@@ -504,33 +473,34 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], 48
-; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[8:11], v[12:15], a[0:15]
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], 48
+; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
; GISEL-NEXT: s_nop 8
-; GISEL-NEXT: global_store_dwordx4 v[0:1], a[16:19], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[12:13], a[16:19], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[2:3], a[20:23], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[14:15], a[20:23], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[4:5], a[24:27], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[16:17], a[24:27], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[6:7], a[28:31], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[18:19], a[28:31], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[0:1], v[16:19], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[12:13], v[8:11], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[2:3], v[8:11], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 0
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19]
-; GISEL-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[22:23]
-; GISEL-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
;
@@ -538,15 +508,15 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; HEURRC-NEXT: v_mov_b64_e32 v[0:1], 48
-; HEURRC-NEXT: v_mov_b64_e32 v[2:3], 32
-; HEURRC-NEXT: v_mov_b64_e32 v[4:5], 16
+; HEURRC-NEXT: v_mov_b64_e32 v[8:9], 48
+; HEURRC-NEXT: v_mov_b64_e32 v[10:11], 32
+; HEURRC-NEXT: v_mov_b64_e32 v[12:13], 16
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[24:25]
-; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[26:27]
-; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[28:29]
+; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
+; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
+; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8
-; HEURRC-NEXT: v_mov_b64_e32 v[14:15], s[30:31]
+; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9
; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10
; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11
@@ -564,39 +534,40 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23
; HEURRC-NEXT: v_mov_b32_e32 v16, s16
; HEURRC-NEXT: v_mov_b32_e32 v17, s17
-; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[8:11], v[12:15], a[0:15]
+; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15]
; HEURRC-NEXT: v_mov_b32_e32 v18, s18
; HEURRC-NEXT: v_mov_b32_e32 v19, s19
-; HEURRC-NEXT: v_mov_b32_e32 v8, s20
-; HEURRC-NEXT: v_mov_b32_e32 v9, s21
-; HEURRC-NEXT: v_mov_b32_e32 v10, s22
-; HEURRC-NEXT: v_mov_b32_e32 v11, s23
-; HEURRC-NEXT: v_mov_b64_e32 v[6:7], 0
+; HEURRC-NEXT: v_mov_b32_e32 v0, s20
+; HEURRC-NEXT: v_mov_b32_e32 v1, s21
+; HEURRC-NEXT: v_mov_b32_e32 v2, s22
+; HEURRC-NEXT: v_mov_b32_e32 v3, s23
+; HEURRC-NEXT: v_mov_b64_e32 v[14:15], 0
; HEURRC-NEXT: s_nop 4
-; HEURRC-NEXT: global_store_dwordx4 v[0:1], a[28:31], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[2:3], a[24:27], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[4:5], a[20:23], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[6:7], a[16:19], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[2:3], v[16:19], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[0:1], v[8:11], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
+; HEURRC-NEXT: s_nop 0
; HEURRC-NEXT: v_mov_b32_e32 v0, s8
; HEURRC-NEXT: v_mov_b32_e32 v1, s9
; HEURRC-NEXT: v_mov_b32_e32 v2, s10
; HEURRC-NEXT: v_mov_b32_e32 v3, s11
-; HEURRC-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
; HEURRC-NEXT: v_mov_b32_e32 v0, s12
; HEURRC-NEXT: v_mov_b32_e32 v1, s13
; HEURRC-NEXT: v_mov_b32_e32 v2, s14
; HEURRC-NEXT: v_mov_b32_e32 v3, s15
-; HEURRC-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_endpgm
;
@@ -604,15 +575,15 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
; VGPRRC: ; %bb.0:
; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; VGPRRC-NEXT: v_mov_b64_e32 v[32:33], 48
-; VGPRRC-NEXT: v_mov_b64_e32 v[34:35], 32
-; VGPRRC-NEXT: v_mov_b64_e32 v[36:37], 16
+; VGPRRC-NEXT: v_mov_b64_e32 v[40:41], 48
+; VGPRRC-NEXT: v_mov_b64_e32 v[42:43], 32
+; VGPRRC-NEXT: v_mov_b64_e32 v[44:45], 16
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
-; VGPRRC-NEXT: v_mov_b64_e32 v[42:43], s[26:27]
-; VGPRRC-NEXT: v_mov_b64_e32 v[40:41], s[24:25]
-; VGPRRC-NEXT: v_mov_b64_e32 v[46:47], s[30:31]
+; VGPRRC-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; VGPRRC-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
+; VGPRRC-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
-; VGPRRC-NEXT: v_mov_b64_e32 v[44:45], s[28:29]
+; VGPRRC-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
@@ -622,40 +593,40 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
; VGPRRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; VGPRRC-NEXT: v_mov_b32_e32 v48, s16
; VGPRRC-NEXT: v_mov_b32_e32 v49, s17
-; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[40:43], v[44:47], v[0:15]
+; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[32:35], v[36:39], v[0:15]
; VGPRRC-NEXT: v_mov_b32_e32 v50, s18
; VGPRRC-NEXT: v_mov_b32_e32 v51, s19
-; VGPRRC-NEXT: v_mov_b64_e32 v[38:39], 0
+; VGPRRC-NEXT: v_mov_b64_e32 v[46:47], 0
; VGPRRC-NEXT: s_nop 8
-; VGPRRC-NEXT: global_store_dwordx4 v[32:33], v[28:31], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[28:31], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[34:35], v[24:27], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[24:27], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[36:37], v[20:23], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[20:23], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[38:39], v[16:19], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[16:19], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: v_mov_b32_e32 v0, s20
; VGPRRC-NEXT: v_mov_b32_e32 v1, s21
; VGPRRC-NEXT: v_mov_b32_e32 v2, s22
; VGPRRC-NEXT: v_mov_b32_e32 v3, s23
-; VGPRRC-NEXT: global_store_dwordx4 v[34:35], v[48:51], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[48:51], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v0, s8
; VGPRRC-NEXT: v_mov_b32_e32 v1, s9
; VGPRRC-NEXT: v_mov_b32_e32 v2, s10
; VGPRRC-NEXT: v_mov_b32_e32 v3, s11
-; VGPRRC-NEXT: global_store_dwordx4 v[38:39], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v0, s12
; VGPRRC-NEXT: v_mov_b32_e32 v1, s13
; VGPRRC-NEXT: v_mov_b32_e32 v2, s14
; VGPRRC-NEXT: v_mov_b32_e32 v3, s15
-; VGPRRC-NEXT: global_store_dwordx4 v[36:37], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_f32_32x32x16_f16:
@@ -794,15 +765,15 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48
-; SDAG-NEXT: v_mov_b64_e32 v[2:3], 32
-; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], 48
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], 32
+; SDAG-NEXT: v_mov_b64_e32 v[12:13], 16
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[24:25]
-; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[26:27]
-; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[28:29]
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
-; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[30:31]
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
@@ -820,39 +791,40 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
; SDAG-NEXT: v_mov_b32_e32 v16, s16
; SDAG-NEXT: v_mov_b32_e32 v17, s17
-; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[8:11], v[12:15], a[0:15] cbsz:2 abid:3 blgp:1
+; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1
; SDAG-NEXT: v_mov_b32_e32 v18, s18
; SDAG-NEXT: v_mov_b32_e32 v19, s19
-; SDAG-NEXT: v_mov_b32_e32 v8, s20
-; SDAG-NEXT: v_mov_b32_e32 v9, s21
-; SDAG-NEXT: v_mov_b32_e32 v10, s22
-; SDAG-NEXT: v_mov_b32_e32 v11, s23
-; SDAG-NEXT: v_mov_b64_e32 v[6:7], 0
+; SDAG-NEXT: v_mov_b32_e32 v0, s20
+; SDAG-NEXT: v_mov_b32_e32 v1, s21
+; SDAG-NEXT: v_mov_b32_e32 v2, s22
+; SDAG-NEXT: v_mov_b32_e32 v3, s23
+; SDAG-NEXT: v_mov_b64_e32 v[14:15], 0
; SDAG-NEXT: s_nop 4
-; SDAG-NEXT: global_store_dwordx4 v[0:1], a[28:31], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[2:3], a[24:27], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[4:5], a[20:23], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[6:7], a[16:19], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[2:3], v[16:19], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[0:1], v[8:11], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mov_b32_e32 v0, s8
; SDAG-NEXT: v_mov_b32_e32 v1, s9
; SDAG-NEXT: v_mov_b32_e32 v2, s10
; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mov_b32_e32 v0, s12
; SDAG-NEXT: v_mov_b32_e32 v1, s13
; SDAG-NEXT: v_mov_b32_e32 v2, s14
; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -860,15 +832,15 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
; GISEL: ; %bb.0:
; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], 0
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], 16
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], 32
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], 0
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], 16
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], 32
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[24:25]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[26:27]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[28:29]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[30:31]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
@@ -884,33 +856,34 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], 48
-; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[8:11], v[12:15], a[0:15] cbsz:2 abid:3 blgp:1
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], 48
+; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
; GISEL-NEXT: s_nop 8
-; GISEL-NEXT: global_store_dwordx4 v[0:1], a[16:19], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[12:13], a[16:19], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[2:3], a[20:23], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[14:15], a[20:23], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[4:5], a[24:27], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[16:17], a[24:27], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[6:7], a[28:31], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[18:19], a[28:31], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[0:1], v[16:19], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[12:13], v[8:11], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[2:3], v[8:11], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 0
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19]
-; GISEL-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[22:23]
-; GISEL-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
;
@@ -918,15 +891,15 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; HEURRC-NEXT: v_mov_b64_e32 v[0:1], 48
-; HEURRC-NEXT: v_mov_b64_e32 v[2:3], 32
-; HEURRC-NEXT: v_mov_b64_e32 v[4:5], 16
+; HEURRC-NEXT: v_mov_b64_e32 v[8:9], 48
+; HEURRC-NEXT: v_mov_b64_e32 v[10:11], 32
+; HEURRC-NEXT: v_mov_b64_e32 v[12:13], 16
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[24:25]
-; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[26:27]
-; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[28:29]
+; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
+; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
+; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8
-; HEURRC-NEXT: v_mov_b64_e32 v[14:15], s[30:31]
+; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9
; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10
; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11
@@ -944,39 +917,40 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23
; HEURRC-NEXT: v_mov_b32_e32 v16, s16
; HEURRC-NEXT: v_mov_b32_e32 v17, s17
-; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[8:11], v[12:15], a[0:15] cbsz:2 abid:3 blgp:1
+; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1
; HEURRC-NEXT: v_mov_b32_e32 v18, s18
; HEURRC-NEXT: v_mov_b32_e32 v19, s19
-; HEURRC-NEXT: v_mov_b32_e32 v8, s20
-; HEURRC-NEXT: v_mov_b32_e32 v9, s21
-; HEURRC-NEXT: v_mov_b32_e32 v10, s22
-; HEURRC-NEXT: v_mov_b32_e32 v11, s23
-; HEURRC-NEXT: v_mov_b64_e32 v[6:7], 0
+; HEURRC-NEXT: v_mov_b32_e32 v0, s20
+; HEURRC-NEXT: v_mov_b32_e32 v1, s21
+; HEURRC-NEXT: v_mov_b32_e32 v2, s22
+; HEURRC-NEXT: v_mov_b32_e32 v3, s23
+; HEURRC-NEXT: v_mov_b64_e32 v[14:15], 0
; HEURRC-NEXT: s_nop 4
-; HEURRC-NEXT: global_store_dwordx4 v[0:1], a[28:31], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[2:3], a[24:27], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[4:5], a[20:23], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[6:7], a[16:19], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[2:3], v[16:19], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[0:1], v[8:11], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
+; HEURRC-NEXT: s_nop 0
; HEURRC-NEXT: v_mov_b32_e32 v0, s8
; HEURRC-NEXT: v_mov_b32_e32 v1, s9
; HEURRC-NEXT: v_mov_b32_e32 v2, s10
; HEURRC-NEXT: v_mov_b32_e32 v3, s11
-; HEURRC-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
; HEURRC-NEXT: v_mov_b32_e32 v0, s12
; HEURRC-NEXT: v_mov_b32_e32 v1, s13
; HEURRC-NEXT: v_mov_b32_e32 v2, s14
; HEURRC-NEXT: v_mov_b32_e32 v3, s15
-; HEURRC-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_endpgm
;
@@ -984,15 +958,15 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
; VGPRRC: ; %bb.0:
; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; VGPRRC-NEXT: v_mov_b64_e32 v[32:33], 48
-; VGPRRC-NEXT: v_mov_b64_e32 v[34:35], 32
-; VGPRRC-NEXT: v_mov_b64_e32 v[36:37], 16
+; VGPRRC-NEXT: v_mov_b64_e32 v[40:41], 48
+; VGPRRC-NEXT: v_mov_b64_e32 v[42:43], 32
+; VGPRRC-NEXT: v_mov_b64_e32 v[44:45], 16
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
-; VGPRRC-NEXT: v_mov_b64_e32 v[42:43], s[26:27]
-; VGPRRC-NEXT: v_mov_b64_e32 v[40:41], s[24:25]
-; VGPRRC-NEXT: v_mov_b64_e32 v[46:47], s[30:31]
+; VGPRRC-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; VGPRRC-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
+; VGPRRC-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
-; VGPRRC-NEXT: v_mov_b64_e32 v[44:45], s[28:29]
+; VGPRRC-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
@@ -1002,40 +976,40 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
; VGPRRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; VGPRRC-NEXT: v_mov_b32_e32 v48, s16
; VGPRRC-NEXT: v_mov_b32_e32 v49, s17
-; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[40:43], v[44:47], v[0:15] cbsz:2 abid:3 blgp:1
+; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[32:35], v[36:39], v[0:15] cbsz:2 abid:3 blgp:1
; VGPRRC-NEXT: v_mov_b32_e32 v50, s18
; VGPRRC-NEXT: v_mov_b32_e32 v51, s19
-; VGPRRC-NEXT: v_mov_b64_e32 v[38:39], 0
+; VGPRRC-NEXT: v_mov_b64_e32 v[46:47], 0
; VGPRRC-NEXT: s_nop 8
-; VGPRRC-NEXT: global_store_dwordx4 v[32:33], v[28:31], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[28:31], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[34:35], v[24:27], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[24:27], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[36:37], v[20:23], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[20:23], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[38:39], v[16:19], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[16:19], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: v_mov_b32_e32 v0, s20
; VGPRRC-NEXT: v_mov_b32_e32 v1, s21
; VGPRRC-NEXT: v_mov_b32_e32 v2, s22
; VGPRRC-NEXT: v_mov_b32_e32 v3, s23
-; VGPRRC-NEXT: global_store_dwordx4 v[34:35], v[48:51], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[48:51], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v0, s8
; VGPRRC-NEXT: v_mov_b32_e32 v1, s9
; VGPRRC-NEXT: v_mov_b32_e32 v2, s10
; VGPRRC-NEXT: v_mov_b32_e32 v3, s11
-; VGPRRC-NEXT: global_store_dwordx4 v[38:39], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v0, s12
; VGPRRC-NEXT: v_mov_b32_e32 v1, s13
; VGPRRC-NEXT: v_mov_b32_e32 v2, s14
; VGPRRC-NEXT: v_mov_b32_e32 v3, s15
-; VGPRRC-NEXT: global_store_dwordx4 v[36:37], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_f32_32x32x16_f16__flags:
@@ -1170,105 +1144,65 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
}
define <16 x float> @test_mfma_f32_32x32x16_f16__mac(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2) {
-; SDAG-LABEL: test_mfma_f32_32x32x16_f16__mac:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a15, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v14
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v13
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v12
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15]
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_f32_32x32x16_f16__mac:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v8
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v9
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v11
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v12
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v13
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a15, v23
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15]
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_f32_32x32x16_f16__mac:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v8
+; GCN-NEXT: v_accvgpr_write_b32 a1, v9
+; GCN-NEXT: v_accvgpr_write_b32 a2, v10
+; GCN-NEXT: v_accvgpr_write_b32 a3, v11
+; GCN-NEXT: v_accvgpr_write_b32 a4, v12
+; GCN-NEXT: v_accvgpr_write_b32 a5, v13
+; GCN-NEXT: v_accvgpr_write_b32 a6, v14
+; GCN-NEXT: v_accvgpr_write_b32 a7, v15
+; GCN-NEXT: v_accvgpr_write_b32 a8, v16
+; GCN-NEXT: v_accvgpr_write_b32 a9, v17
+; GCN-NEXT: v_accvgpr_write_b32 a10, v18
+; GCN-NEXT: v_accvgpr_write_b32 a11, v19
+; GCN-NEXT: v_accvgpr_write_b32 a12, v20
+; GCN-NEXT: v_accvgpr_write_b32 a13, v21
+; GCN-NEXT: v_accvgpr_write_b32 a14, v22
+; GCN-NEXT: v_accvgpr_write_b32 a15, v23
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15]
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
;
; HEURRC-LABEL: test_mfma_f32_32x32x16_f16__mac:
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; HEURRC-NEXT: v_accvgpr_write_b32 a15, v23
-; HEURRC-NEXT: v_accvgpr_write_b32 a14, v22
-; HEURRC-NEXT: v_accvgpr_write_b32 a13, v21
-; HEURRC-NEXT: v_accvgpr_write_b32 a12, v20
-; HEURRC-NEXT: v_accvgpr_write_b32 a11, v19
-; HEURRC-NEXT: v_accvgpr_write_b32 a10, v18
-; HEURRC-NEXT: v_accvgpr_write_b32 a9, v17
-; HEURRC-NEXT: v_accvgpr_write_b32 a8, v16
-; HEURRC-NEXT: v_accvgpr_write_b32 a7, v15
-; HEURRC-NEXT: v_accvgpr_write_b32 a6, v14
-; HEURRC-NEXT: v_accvgpr_write_b32 a5, v13
-; HEURRC-NEXT: v_accvgpr_write_b32 a4, v12
-; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9
; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8
+; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9
+; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10
+; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11
+; HEURRC-NEXT: v_accvgpr_write_b32 a4, v12
+; HEURRC-NEXT: v_accvgpr_write_b32 a5, v13
+; HEURRC-NEXT: v_accvgpr_write_b32 a6, v14
+; HEURRC-NEXT: v_accvgpr_write_b32 a7, v15
+; HEURRC-NEXT: v_accvgpr_write_b32 a8, v16
+; HEURRC-NEXT: v_accvgpr_write_b32 a9, v17
+; HEURRC-NEXT: v_accvgpr_write_b32 a10, v18
+; HEURRC-NEXT: v_accvgpr_write_b32 a11, v19
+; HEURRC-NEXT: v_accvgpr_write_b32 a12, v20
+; HEURRC-NEXT: v_accvgpr_write_b32 a13, v21
+; HEURRC-NEXT: v_accvgpr_write_b32 a14, v22
+; HEURRC-NEXT: v_accvgpr_write_b32 a15, v23
; HEURRC-NEXT: s_nop 1
; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15]
; HEURRC-NEXT: s_nop 11
@@ -1380,105 +1314,65 @@ define <16 x float> @test_mfma_f32_32x32x16_f16__mac(<8 x half> %arg0, <8 x half
}
define <16 x float> @test_mfma_f32_32x32x16_f16__mac__flags(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2) {
-; SDAG-LABEL: test_mfma_f32_32x32x16_f16__mac__flags:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a15, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v14
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v13
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v12
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_f32_32x32x16_f16__mac__flags:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v8
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v9
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v11
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v12
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v13
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a15, v23
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_f32_32x32x16_f16__mac__flags:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v8
+; GCN-NEXT: v_accvgpr_write_b32 a1, v9
+; GCN-NEXT: v_accvgpr_write_b32 a2, v10
+; GCN-NEXT: v_accvgpr_write_b32 a3, v11
+; GCN-NEXT: v_accvgpr_write_b32 a4, v12
+; GCN-NEXT: v_accvgpr_write_b32 a5, v13
+; GCN-NEXT: v_accvgpr_write_b32 a6, v14
+; GCN-NEXT: v_accvgpr_write_b32 a7, v15
+; GCN-NEXT: v_accvgpr_write_b32 a8, v16
+; GCN-NEXT: v_accvgpr_write_b32 a9, v17
+; GCN-NEXT: v_accvgpr_write_b32 a10, v18
+; GCN-NEXT: v_accvgpr_write_b32 a11, v19
+; GCN-NEXT: v_accvgpr_write_b32 a12, v20
+; GCN-NEXT: v_accvgpr_write_b32 a13, v21
+; GCN-NEXT: v_accvgpr_write_b32 a14, v22
+; GCN-NEXT: v_accvgpr_write_b32 a15, v23
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
;
; HEURRC-LABEL: test_mfma_f32_32x32x16_f16__mac__flags:
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; HEURRC-NEXT: v_accvgpr_write_b32 a15, v23
-; HEURRC-NEXT: v_accvgpr_write_b32 a14, v22
-; HEURRC-NEXT: v_accvgpr_write_b32 a13, v21
-; HEURRC-NEXT: v_accvgpr_write_b32 a12, v20
-; HEURRC-NEXT: v_accvgpr_write_b32 a11, v19
-; HEURRC-NEXT: v_accvgpr_write_b32 a10, v18
-; HEURRC-NEXT: v_accvgpr_write_b32 a9, v17
-; HEURRC-NEXT: v_accvgpr_write_b32 a8, v16
-; HEURRC-NEXT: v_accvgpr_write_b32 a7, v15
-; HEURRC-NEXT: v_accvgpr_write_b32 a6, v14
-; HEURRC-NEXT: v_accvgpr_write_b32 a5, v13
-; HEURRC-NEXT: v_accvgpr_write_b32 a4, v12
-; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9
; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8
+; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9
+; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10
+; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11
+; HEURRC-NEXT: v_accvgpr_write_b32 a4, v12
+; HEURRC-NEXT: v_accvgpr_write_b32 a5, v13
+; HEURRC-NEXT: v_accvgpr_write_b32 a6, v14
+; HEURRC-NEXT: v_accvgpr_write_b32 a7, v15
+; HEURRC-NEXT: v_accvgpr_write_b32 a8, v16
+; HEURRC-NEXT: v_accvgpr_write_b32 a9, v17
+; HEURRC-NEXT: v_accvgpr_write_b32 a10, v18
+; HEURRC-NEXT: v_accvgpr_write_b32 a11, v19
+; HEURRC-NEXT: v_accvgpr_write_b32 a12, v20
+; HEURRC-NEXT: v_accvgpr_write_b32 a13, v21
+; HEURRC-NEXT: v_accvgpr_write_b32 a14, v22
+; HEURRC-NEXT: v_accvgpr_write_b32 a15, v23
; HEURRC-NEXT: s_nop 1
; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1
; HEURRC-NEXT: s_nop 11
@@ -2642,45 +2536,29 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal
declare <4 x i32> @llvm.amdgcn.mfma.i32.16x16x64.i8(<4 x i32>, <4 x i32>, <4 x i32>, i32 immarg, i32 immarg, i32 immarg)
define <4 x i32> @test_mfma_i32_16x16x64_i8(<4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2) {
-; SDAG-LABEL: test_mfma_i32_16x16x64_i8:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3]
-; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_i32_16x16x64_i8:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v8
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v9
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v11
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3]
-; GISEL-NEXT: s_nop 7
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_i32_16x16x64_i8:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v8
+; GCN-NEXT: v_accvgpr_write_b32 a1, v9
+; GCN-NEXT: v_accvgpr_write_b32 a2, v10
+; GCN-NEXT: v_accvgpr_write_b32 a3, v11
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3]
+; GCN-NEXT: s_nop 7
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
;
; HEURRC-LABEL: test_mfma_i32_16x16x64_i8:
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9
; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8
+; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9
+; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10
+; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11
; HEURRC-NEXT: s_nop 1
; HEURRC-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3]
; HEURRC-NEXT: s_nop 7
@@ -2720,45 +2598,29 @@ define <4 x i32> @test_mfma_i32_16x16x64_i8(<4 x i32> %arg0, <4 x i32> %arg1, <4
}
define <4 x i32> @test_mfma_i32_16x16x64_i8__flags(<4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2) {
-; SDAG-LABEL: test_mfma_i32_16x16x64_i8__flags:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
-; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_i32_16x16x64_i8__flags:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v8
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v9
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v11
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
-; GISEL-NEXT: s_nop 7
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_i32_16x16x64_i8__flags:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v8
+; GCN-NEXT: v_accvgpr_write_b32 a1, v9
+; GCN-NEXT: v_accvgpr_write_b32 a2, v10
+; GCN-NEXT: v_accvgpr_write_b32 a3, v11
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
+; GCN-NEXT: s_nop 7
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
;
; HEURRC-LABEL: test_mfma_i32_16x16x64_i8__flags:
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9
; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8
+; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9
+; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10
+; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11
; HEURRC-NEXT: s_nop 1
; HEURRC-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
; HEURRC-NEXT: s_nop 7
@@ -3173,15 +3035,15 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
; GISEL: ; %bb.0:
; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], 0
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], 16
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], 32
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], 0
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], 16
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], 32
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[24:25]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[26:27]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[28:29]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[30:31]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
@@ -3197,33 +3059,34 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], 48
-; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[8:11], v[12:15], a[0:15]
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], 48
+; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
; GISEL-NEXT: s_nop 8
-; GISEL-NEXT: global_store_dwordx4 v[0:1], a[16:19], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[12:13], a[16:19], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[2:3], a[20:23], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[14:15], a[20:23], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[4:5], a[24:27], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[16:17], a[24:27], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[6:7], a[28:31], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[18:19], a[28:31], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[0:1], v[16:19], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[12:13], v[8:11], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[2:3], v[8:11], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 0
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19]
-; GISEL-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[22:23]
-; GISEL-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
;
@@ -3584,15 +3447,15 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
; GISEL: ; %bb.0:
; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], 0
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], 16
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], 32
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], 0
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], 16
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], 32
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[24:25]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[26:27]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[28:29]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[30:31]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
@@ -3608,33 +3471,34 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], 48
-; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[8:11], v[12:15], a[0:15] cbsz:2 abid:3 blgp:1
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], 48
+; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
; GISEL-NEXT: s_nop 8
-; GISEL-NEXT: global_store_dwordx4 v[0:1], a[16:19], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[12:13], a[16:19], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[2:3], a[20:23], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[14:15], a[20:23], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[4:5], a[24:27], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[16:17], a[24:27], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[6:7], a[28:31], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[18:19], a[28:31], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[0:1], v[16:19], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[12:13], v[8:11], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[2:3], v[8:11], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 0
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19]
-; GISEL-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[22:23]
-; GISEL-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
;
@@ -3920,105 +3784,65 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
}
define <16 x i32> @test_mfma_i32_32x32x32_i8__mac(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2) {
-; SDAG-LABEL: test_mfma_i32_32x32x32_i8__mac:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a15, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v14
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v13
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v12
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15]
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_i32_32x32x32_i8__mac:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v8
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v9
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v11
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v12
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v13
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a15, v23
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15]
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_i32_32x32x32_i8__mac:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v8
+; GCN-NEXT: v_accvgpr_write_b32 a1, v9
+; GCN-NEXT: v_accvgpr_write_b32 a2, v10
+; GCN-NEXT: v_accvgpr_write_b32 a3, v11
+; GCN-NEXT: v_accvgpr_write_b32 a4, v12
+; GCN-NEXT: v_accvgpr_write_b32 a5, v13
+; GCN-NEXT: v_accvgpr_write_b32 a6, v14
+; GCN-NEXT: v_accvgpr_write_b32 a7, v15
+; GCN-NEXT: v_accvgpr_write_b32 a8, v16
+; GCN-NEXT: v_accvgpr_write_b32 a9, v17
+; GCN-NEXT: v_accvgpr_write_b32 a10, v18
+; GCN-NEXT: v_accvgpr_write_b32 a11, v19
+; GCN-NEXT: v_accvgpr_write_b32 a12, v20
+; GCN-NEXT: v_accvgpr_write_b32 a13, v21
+; GCN-NEXT: v_accvgpr_write_b32 a14, v22
+; GCN-NEXT: v_accvgpr_write_b32 a15, v23
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15]
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
;
; HEURRC-LABEL: test_mfma_i32_32x32x32_i8__mac:
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; HEURRC-NEXT: v_accvgpr_write_b32 a15, v23
-; HEURRC-NEXT: v_accvgpr_write_b32 a14, v22
-; HEURRC-NEXT: v_accvgpr_write_b32 a13, v21
-; HEURRC-NEXT: v_accvgpr_write_b32 a12, v20
-; HEURRC-NEXT: v_accvgpr_write_b32 a11, v19
-; HEURRC-NEXT: v_accvgpr_write_b32 a10, v18
-; HEURRC-NEXT: v_accvgpr_write_b32 a9, v17
-; HEURRC-NEXT: v_accvgpr_write_b32 a8, v16
-; HEURRC-NEXT: v_accvgpr_write_b32 a7, v15
-; HEURRC-NEXT: v_accvgpr_write_b32 a6, v14
-; HEURRC-NEXT: v_accvgpr_write_b32 a5, v13
-; HEURRC-NEXT: v_accvgpr_write_b32 a4, v12
-; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9
; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8
+; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9
+; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10
+; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11
+; HEURRC-NEXT: v_accvgpr_write_b32 a4, v12
+; HEURRC-NEXT: v_accvgpr_write_b32 a5, v13
+; HEURRC-NEXT: v_accvgpr_write_b32 a6, v14
+; HEURRC-NEXT: v_accvgpr_write_b32 a7, v15
+; HEURRC-NEXT: v_accvgpr_write_b32 a8, v16
+; HEURRC-NEXT: v_accvgpr_write_b32 a9, v17
+; HEURRC-NEXT: v_accvgpr_write_b32 a10, v18
+; HEURRC-NEXT: v_accvgpr_write_b32 a11, v19
+; HEURRC-NEXT: v_accvgpr_write_b32 a12, v20
+; HEURRC-NEXT: v_accvgpr_write_b32 a13, v21
+; HEURRC-NEXT: v_accvgpr_write_b32 a14, v22
+; HEURRC-NEXT: v_accvgpr_write_b32 a15, v23
; HEURRC-NEXT: s_nop 1
; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15]
; HEURRC-NEXT: s_nop 11
@@ -4130,105 +3954,65 @@ define <16 x i32> @test_mfma_i32_32x32x32_i8__mac(<4 x i32> %arg0, <4 x i32> %ar
}
define <16 x i32> @test_mfma_i32_32x32x32_i8__mac__flags(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2) {
-; SDAG-LABEL: test_mfma_i32_32x32x32_i8__mac__flags:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a15, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v14
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v13
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v12
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_i32_32x32x32_i8__mac__flags:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v8
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v9
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v11
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v12
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v13
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a15, v23
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_i32_32x32x32_i8__mac__flags:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v8
+; GCN-NEXT: v_accvgpr_write_b32 a1, v9
+; GCN-NEXT: v_accvgpr_write_b32 a2, v10
+; GCN-NEXT: v_accvgpr_write_b32 a3, v11
+; GCN-NEXT: v_accvgpr_write_b32 a4, v12
+; GCN-NEXT: v_accvgpr_write_b32 a5, v13
+; GCN-NEXT: v_accvgpr_write_b32 a6, v14
+; GCN-NEXT: v_accvgpr_write_b32 a7, v15
+; GCN-NEXT: v_accvgpr_write_b32 a8, v16
+; GCN-NEXT: v_accvgpr_write_b32 a9, v17
+; GCN-NEXT: v_accvgpr_write_b32 a10, v18
+; GCN-NEXT: v_accvgpr_write_b32 a11, v19
+; GCN-NEXT: v_accvgpr_write_b32 a12, v20
+; GCN-NEXT: v_accvgpr_write_b32 a13, v21
+; GCN-NEXT: v_accvgpr_write_b32 a14, v22
+; GCN-NEXT: v_accvgpr_write_b32 a15, v23
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
;
; HEURRC-LABEL: test_mfma_i32_32x32x32_i8__mac__flags:
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; HEURRC-NEXT: v_accvgpr_write_b32 a15, v23
-; HEURRC-NEXT: v_accvgpr_write_b32 a14, v22
-; HEURRC-NEXT: v_accvgpr_write_b32 a13, v21
-; HEURRC-NEXT: v_accvgpr_write_b32 a12, v20
-; HEURRC-NEXT: v_accvgpr_write_b32 a11, v19
-; HEURRC-NEXT: v_accvgpr_write_b32 a10, v18
-; HEURRC-NEXT: v_accvgpr_write_b32 a9, v17
-; HEURRC-NEXT: v_accvgpr_write_b32 a8, v16
-; HEURRC-NEXT: v_accvgpr_write_b32 a7, v15
-; HEURRC-NEXT: v_accvgpr_write_b32 a6, v14
-; HEURRC-NEXT: v_accvgpr_write_b32 a5, v13
-; HEURRC-NEXT: v_accvgpr_write_b32 a4, v12
-; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9
; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8
+; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9
+; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10
+; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11
+; HEURRC-NEXT: v_accvgpr_write_b32 a4, v12
+; HEURRC-NEXT: v_accvgpr_write_b32 a5, v13
+; HEURRC-NEXT: v_accvgpr_write_b32 a6, v14
+; HEURRC-NEXT: v_accvgpr_write_b32 a7, v15
+; HEURRC-NEXT: v_accvgpr_write_b32 a8, v16
+; HEURRC-NEXT: v_accvgpr_write_b32 a9, v17
+; HEURRC-NEXT: v_accvgpr_write_b32 a10, v18
+; HEURRC-NEXT: v_accvgpr_write_b32 a11, v19
+; HEURRC-NEXT: v_accvgpr_write_b32 a12, v20
+; HEURRC-NEXT: v_accvgpr_write_b32 a13, v21
+; HEURRC-NEXT: v_accvgpr_write_b32 a14, v22
+; HEURRC-NEXT: v_accvgpr_write_b32 a15, v23
; HEURRC-NEXT: s_nop 1
; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1
; HEURRC-NEXT: s_nop 11
@@ -5515,10 +5299,10 @@ define <4 x float> @test_mfma_f32_16x16x32_bf16(<8 x bfloat> %arg0, <8 x bfloat>
; GCN-LABEL: test_mfma_f32_16x16x32_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a3, v11
-; GCN-NEXT: v_accvgpr_write_b32 a2, v10
-; GCN-NEXT: v_accvgpr_write_b32 a1, v9
; GCN-NEXT: v_accvgpr_write_b32 a0, v8
+; GCN-NEXT: v_accvgpr_write_b32 a1, v9
+; GCN-NEXT: v_accvgpr_write_b32 a2, v10
+; GCN-NEXT: v_accvgpr_write_b32 a3, v11
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3]
; GCN-NEXT: s_nop 7
@@ -5531,10 +5315,10 @@ define <4 x float> @test_mfma_f32_16x16x32_bf16(<8 x bfloat> %arg0, <8 x bfloat>
; HEURRC-LABEL: test_mfma_f32_16x16x32_bf16:
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9
; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8
+; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9
+; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10
+; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11
; HEURRC-NEXT: s_nop 1
; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3]
; HEURRC-NEXT: s_nop 7
@@ -5577,10 +5361,10 @@ define <4 x float> @test_mfma_f32_16x16x32_bf16__flags(<8 x bfloat> %arg0, <8 x
; GCN-LABEL: test_mfma_f32_16x16x32_bf16__flags:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a3, v11
-; GCN-NEXT: v_accvgpr_write_b32 a2, v10
-; GCN-NEXT: v_accvgpr_write_b32 a1, v9
; GCN-NEXT: v_accvgpr_write_b32 a0, v8
+; GCN-NEXT: v_accvgpr_write_b32 a1, v9
+; GCN-NEXT: v_accvgpr_write_b32 a2, v10
+; GCN-NEXT: v_accvgpr_write_b32 a3, v11
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
; GCN-NEXT: s_nop 7
@@ -5593,10 +5377,10 @@ define <4 x float> @test_mfma_f32_16x16x32_bf16__flags(<8 x bfloat> %arg0, <8 x
; HEURRC-LABEL: test_mfma_f32_16x16x32_bf16__flags:
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9
; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8
+; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9
+; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10
+; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11
; HEURRC-NEXT: s_nop 1
; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
; HEURRC-NEXT: s_nop 7
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.i8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.i8.ll
index c1946630ef5f1..d24f1f0b526c3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.i8.ll
@@ -52,26 +52,27 @@ define amdgpu_kernel void @test_mfma_i32_32x32x8i8(ptr addrspace(1) %arg) #0 {
; GFX908-NEXT: v_mfma_i32_32x32x8i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
; GFX908-NEXT: s_nop 15
; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a12
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a13
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a15
-; GFX908-NEXT: v_accvgpr_read_b32 v4, a8
-; GFX908-NEXT: v_accvgpr_read_b32 v5, a9
-; GFX908-NEXT: v_accvgpr_read_b32 v6, a10
-; GFX908-NEXT: v_accvgpr_read_b32 v7, a11
-; GFX908-NEXT: v_accvgpr_read_b32 v8, a4
-; GFX908-NEXT: v_accvgpr_read_b32 v9, a5
-; GFX908-NEXT: v_accvgpr_read_b32 v10, a6
-; GFX908-NEXT: v_accvgpr_read_b32 v11, a7
-; GFX908-NEXT: v_accvgpr_read_b32 v12, a0
-; GFX908-NEXT: v_accvgpr_read_b32 v13, a1
-; GFX908-NEXT: v_accvgpr_read_b32 v14, a2
-; GFX908-NEXT: v_accvgpr_read_b32 v15, a3
-; GFX908-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:48
-; GFX908-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:32
-; GFX908-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16
-; GFX908-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17]
+; GFX908-NEXT: v_accvgpr_read_b32 v15, a15
+; GFX908-NEXT: v_accvgpr_read_b32 v14, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v13, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v12, a12
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v7, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a4
+; GFX908-NEXT: v_accvgpr_read_b32 v11, a11
+; GFX908-NEXT: v_accvgpr_read_b32 v10, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v9, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v8, a8
+; GFX908-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX908-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX908-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
; GFX908-NEXT: s_endpgm
;
; GFX90A-LABEL: test_mfma_i32_32x32x8i8:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
index 3d9ebf91e8f47..7e30af96bb8b9 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
@@ -99,59 +99,59 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 {
; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31] cbsz:1 abid:2 blgp:3
; NOLIT-SRCC-NEXT: s_nop 15
; NOLIT-SRCC-NEXT: s_nop 1
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:112
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35]
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16
; NOLIT-SRCC-NEXT: s_endpgm
@@ -234,59 +234,59 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 {
; LIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31] cbsz:1 abid:2 blgp:3
; LIT-SRCC-NEXT: s_nop 15
; LIT-SRCC-NEXT: s_nop 1
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96
; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:112
; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64
; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80
; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32
; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48
; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35]
; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16
; LIT-SRCC-NEXT: s_endpgm
@@ -510,25 +510,25 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
; NOLIT-SRCC-NEXT: s_nop 9
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4
; NOLIT-SRCC-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:48
; NOLIT-SRCC-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:32
; NOLIT-SRCC-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17]
; NOLIT-SRCC-NEXT: s_endpgm
@@ -577,25 +577,25 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
; LIT-SRCC-NEXT: s_nop 9
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4
; LIT-SRCC-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:48
; LIT-SRCC-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:32
; LIT-SRCC-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17]
; LIT-SRCC-NEXT: s_endpgm
@@ -864,22 +864,22 @@ define amdgpu_kernel void @test_mfma_f32_32x32x2f32(ptr addrspace(1) %arg) #0 {
; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x2f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
; NOLIT-SRCC-NEXT: s_nop 15
; NOLIT-SRCC-NEXT: s_nop 1
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a1
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a2
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a3
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a2
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a1
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a0
; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:48
; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:32
; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16
@@ -931,22 +931,22 @@ define amdgpu_kernel void @test_mfma_f32_32x32x2f32(ptr addrspace(1) %arg) #0 {
; LIT-SRCC-NEXT: v_mfma_f32_32x32x2f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
; LIT-SRCC-NEXT: s_nop 15
; LIT-SRCC-NEXT: s_nop 1
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a1
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a2
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a3
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a2
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a1
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a0
; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:48
; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:32
; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16
@@ -1257,59 +1257,59 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a
; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x4f16 a[0:31], v[2:3], v[0:1], a[0:31] cbsz:1 abid:2 blgp:3
; NOLIT-SRCC-NEXT: s_nop 15
; NOLIT-SRCC-NEXT: s_nop 1
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:96
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:112
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:64
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:80
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:32
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:48
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37]
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:16
; NOLIT-SRCC-NEXT: s_endpgm
@@ -1396,59 +1396,59 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a
; LIT-SRCC-NEXT: v_mfma_f32_32x32x4f16 a[0:31], v[2:3], v[0:1], a[0:31] cbsz:1 abid:2 blgp:3
; LIT-SRCC-NEXT: s_nop 15
; LIT-SRCC-NEXT: s_nop 1
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:96
; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:112
; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19
-; LIT-SRCC-NEXT: s_nop 1
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16
+; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:64
; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:80
; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:32
; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:48
; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37]
; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:16
; LIT-SRCC-NEXT: s_endpgm
@@ -1690,25 +1690,25 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f16(ptr addrspace(1) %arg, ptr a
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x4f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
; NOLIT-SRCC-NEXT: s_nop 9
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4
; NOLIT-SRCC-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:48
; NOLIT-SRCC-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:32
; NOLIT-SRCC-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17]
; NOLIT-SRCC-NEXT: s_endpgm
@@ -1760,25 +1760,25 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f16(ptr addrspace(1) %arg, ptr a
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: v_mfma_f32_16x16x4f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
; LIT-SRCC-NEXT: s_nop 9
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4
; LIT-SRCC-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:48
; LIT-SRCC-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:32
; LIT-SRCC-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17]
; LIT-SRCC-NEXT: s_endpgm
@@ -2080,22 +2080,22 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16(ptr addrspace(1) %arg, ptr a
; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
; NOLIT-SRCC-NEXT: s_nop 15
; NOLIT-SRCC-NEXT: s_nop 1
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a1
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a2
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a3
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a2
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a1
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a0
; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:48
; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:32
; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16
@@ -2150,22 +2150,22 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16(ptr addrspace(1) %arg, ptr a
; LIT-SRCC-NEXT: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
; LIT-SRCC-NEXT: s_nop 15
; LIT-SRCC-NEXT: s_nop 1
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a1
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a2
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a3
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a2
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a1
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a0
; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:48
; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:32
; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16
@@ -2425,7 +2425,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 {
; NOLIT-SRCC-LABEL: test_mfma_i32_32x32x4i8:
; NOLIT-SRCC: ; %bb.0: ; %bb
; NOLIT-SRCC-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
-; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v16, 0
; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
; NOLIT-SRCC-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0
; NOLIT-SRCC-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40
@@ -2482,7 +2482,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 {
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s10
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s11
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s12
-; NOLIT-SRCC-NEXT: v_mov_b32_e32 v5, s20
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, s20
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v3
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a26, v0
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a27, v1
@@ -2491,7 +2491,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 {
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s14
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s15
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, 1
-; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v5
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v4
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a29, v0
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a30, v1
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a31, v2
@@ -2500,67 +2500,53 @@ define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 {
; NOLIT-SRCC-NEXT: v_mfma_i32_32x32x4i8 a[0:31], v3, v0, a[0:31] cbsz:1 abid:2 blgp:3
; NOLIT-SRCC-NEXT: s_nop 15
; NOLIT-SRCC-NEXT: s_nop 1
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27
-; NOLIT-SRCC-NEXT: s_nop 1
-; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96
-; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31
-; NOLIT-SRCC-NEXT: s_nop 1
-; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:112
-; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19
-; NOLIT-SRCC-NEXT: s_nop 1
-; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64
-; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23
-; NOLIT-SRCC-NEXT: s_nop 1
-; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80
-; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11
-; NOLIT-SRCC-NEXT: s_nop 1
-; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32
-; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
-; NOLIT-SRCC-NEXT: s_nop 1
-; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48
-; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
-; NOLIT-SRCC-NEXT: s_nop 1
-; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35]
-; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a27
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a26
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a25
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a24
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
+; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[34:35] offset:96
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a31
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a30
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a29
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a28
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a3
+; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[34:35] offset:112
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a2
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a19
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a18
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a17
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a16
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a1
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a0
+; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[34:35] offset:64
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a15
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a23
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a22
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a21
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a20
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a14
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a13
+; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[34:35] offset:80
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a12
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a11
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a10
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a9
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a8
; NOLIT-SRCC-NEXT: s_nop 1
-; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16
+; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[34:35] offset:32
+; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[34:35] offset:48
+; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[34:35]
+; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[34:35] offset:16
; NOLIT-SRCC-NEXT: s_endpgm
;
; LIT-SRCC-LABEL: test_mfma_i32_32x32x4i8:
; LIT-SRCC: ; %bb.0: ; %bb
; LIT-SRCC-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
-; LIT-SRCC-NEXT: v_mov_b32_e32 v4, 0
+; LIT-SRCC-NEXT: v_mov_b32_e32 v16, 0
; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
; LIT-SRCC-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0
; LIT-SRCC-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40
@@ -2617,7 +2603,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 {
; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s10
; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s11
; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s12
-; LIT-SRCC-NEXT: v_mov_b32_e32 v5, s20
+; LIT-SRCC-NEXT: v_mov_b32_e32 v4, s20
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v3
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a26, v0
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a27, v1
@@ -2626,7 +2612,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 {
; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s14
; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s15
; LIT-SRCC-NEXT: v_mov_b32_e32 v3, 1
-; LIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v5
+; LIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v4
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a29, v0
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a30, v1
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a31, v2
@@ -2635,61 +2621,47 @@ define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 {
; LIT-SRCC-NEXT: v_mfma_i32_32x32x4i8 a[0:31], v3, v0, a[0:31] cbsz:1 abid:2 blgp:3
; LIT-SRCC-NEXT: s_nop 15
; LIT-SRCC-NEXT: s_nop 1
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27
-; LIT-SRCC-NEXT: s_nop 1
-; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96
-; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31
-; LIT-SRCC-NEXT: s_nop 1
-; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:112
-; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19
-; LIT-SRCC-NEXT: s_nop 1
-; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64
-; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23
-; LIT-SRCC-NEXT: s_nop 1
-; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80
-; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11
-; LIT-SRCC-NEXT: s_nop 1
-; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32
-; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
-; LIT-SRCC-NEXT: s_nop 1
-; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48
-; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
-; LIT-SRCC-NEXT: s_nop 1
-; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35]
-; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a27
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a26
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a25
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a24
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7
-; LIT-SRCC-NEXT: s_nop 1
-; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
+; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[34:35] offset:96
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a31
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a30
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a29
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a28
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a3
+; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[34:35] offset:112
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a2
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a19
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a18
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a17
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a16
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a1
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a0
+; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[34:35] offset:64
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a15
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a23
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a22
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a21
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a20
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a14
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a13
+; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[34:35] offset:80
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a12
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a11
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a10
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a9
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a8
+; LIT-SRCC-NEXT: s_nop 1
+; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[34:35] offset:32
+; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[34:35] offset:48
+; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[34:35]
+; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[34:35] offset:16
; LIT-SRCC-NEXT: s_endpgm
;
; GFX90A-LABEL: test_mfma_i32_32x32x4i8:
@@ -2871,134 +2843,134 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8(ptr addrspace(1) %arg) #0 {
; NOLIT-SRCC: ; %bb.0: ; %bb
; NOLIT-SRCC-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 1
-; NOLIT-SRCC-NEXT: v_mov_b32_e32 v12, 0
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v16, 0
; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
; NOLIT-SRCC-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
-; NOLIT-SRCC-NEXT: v_mov_b32_e32 v13, s0
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s0
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s1
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s2
-; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v13
-; NOLIT-SRCC-NEXT: v_mov_b32_e32 v13, s3
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v17
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s3
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v1
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v2
-; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v13
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v17
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s4
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s5
-; NOLIT-SRCC-NEXT: v_mov_b32_e32 v13, s6
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s6
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v1
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v2
-; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v13
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v17
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s7
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s8
-; NOLIT-SRCC-NEXT: v_mov_b32_e32 v13, s9
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s9
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v1
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a8, v2
-; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a9, v13
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a9, v17
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s10
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s11
-; NOLIT-SRCC-NEXT: v_mov_b32_e32 v13, s12
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s12
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a10, v1
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a11, v2
-; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a12, v13
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a12, v17
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s13
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s14
-; NOLIT-SRCC-NEXT: v_mov_b32_e32 v13, s15
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s15
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a13, v1
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a14, v2
-; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a15, v13
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a15, v17
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
; NOLIT-SRCC-NEXT: s_nop 9
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7
-; NOLIT-SRCC-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:48
-; NOLIT-SRCC-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:32
-; NOLIT-SRCC-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a15
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a14
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a13
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a12
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
-; NOLIT-SRCC-NEXT: s_nop 1
-; NOLIT-SRCC-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17]
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a7
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a6
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a5
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a4
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a11
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a10
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a9
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a8
+; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; NOLIT-SRCC-NEXT: s_nop 0
+; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
; NOLIT-SRCC-NEXT: s_endpgm
;
; LIT-SRCC-LABEL: test_mfma_i32_16x16x4i8:
; LIT-SRCC: ; %bb.0: ; %bb
; LIT-SRCC-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 1
-; LIT-SRCC-NEXT: v_mov_b32_e32 v12, 0
+; LIT-SRCC-NEXT: v_mov_b32_e32 v16, 0
; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
; LIT-SRCC-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
-; LIT-SRCC-NEXT: v_mov_b32_e32 v13, s0
+; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s0
; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s1
; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s2
-; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v13
-; LIT-SRCC-NEXT: v_mov_b32_e32 v13, s3
+; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v17
+; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s3
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v1
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v2
-; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v13
+; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v17
; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s4
; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s5
-; LIT-SRCC-NEXT: v_mov_b32_e32 v13, s6
+; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s6
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v1
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v2
-; LIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v13
+; LIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v17
; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s7
; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s8
-; LIT-SRCC-NEXT: v_mov_b32_e32 v13, s9
+; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s9
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v1
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a8, v2
-; LIT-SRCC-NEXT: v_accvgpr_write_b32 a9, v13
+; LIT-SRCC-NEXT: v_accvgpr_write_b32 a9, v17
; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s10
; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s11
-; LIT-SRCC-NEXT: v_mov_b32_e32 v13, s12
+; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s12
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a10, v1
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a11, v2
-; LIT-SRCC-NEXT: v_accvgpr_write_b32 a12, v13
+; LIT-SRCC-NEXT: v_accvgpr_write_b32 a12, v17
; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s13
; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s14
-; LIT-SRCC-NEXT: v_mov_b32_e32 v13, s15
+; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s15
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a13, v1
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a14, v2
-; LIT-SRCC-NEXT: v_accvgpr_write_b32 a15, v13
+; LIT-SRCC-NEXT: v_accvgpr_write_b32 a15, v17
; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
; LIT-SRCC-NEXT: s_nop 9
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7
-; LIT-SRCC-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:48
-; LIT-SRCC-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:32
-; LIT-SRCC-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a15
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a14
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a13
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a12
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
-; LIT-SRCC-NEXT: s_nop 1
-; LIT-SRCC-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17]
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a7
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a6
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a5
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a4
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a11
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a10
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a9
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a8
+; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; LIT-SRCC-NEXT: s_nop 0
+; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
; LIT-SRCC-NEXT: s_endpgm
;
; GFX90A-LABEL: test_mfma_i32_16x16x4i8:
@@ -3123,37 +3095,30 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8_splatimm_src2_64(ptr addrspac
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a15, 64
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2
; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v16, 0
; NOLIT-SRCC-NEXT: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
; NOLIT-SRCC-NEXT: s_nop 9
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
-; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
-; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
-; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11
-; NOLIT-SRCC-NEXT: s_nop 1
-; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
-; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7
-; NOLIT-SRCC-NEXT: s_nop 1
-; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
-; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a15
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a14
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a13
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a12
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
-; NOLIT-SRCC-NEXT: s_nop 1
-; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a7
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a6
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a5
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a4
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a11
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a10
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a9
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a8
+; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
+; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; NOLIT-SRCC-NEXT: s_endpgm
;
; LIT-SRCC-LABEL: test_mfma_i32_16x16x4i8_splatimm_src2_64:
@@ -3161,33 +3126,30 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8_splatimm_src2_64(ptr addrspac
; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 1
; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2
; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; LIT-SRCC-NEXT: v_mov_b32_e32 v8, 0
+; LIT-SRCC-NEXT: v_mov_b32_e32 v16, 0
; LIT-SRCC-NEXT: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, 64 cbsz:1 abid:2 blgp:3
; LIT-SRCC-NEXT: s_nop 9
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11
-; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
-; LIT-SRCC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:48
-; LIT-SRCC-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:32
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7
-; LIT-SRCC-NEXT: s_nop 1
-; LIT-SRCC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
-; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a15
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a14
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a13
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a12
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
-; LIT-SRCC-NEXT: s_nop 1
-; LIT-SRCC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a7
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a6
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a5
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a4
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a11
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a10
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a9
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a8
+; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
+; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; LIT-SRCC-NEXT: s_endpgm
;
; GFX90A-LABEL: test_mfma_i32_16x16x4i8_splatimm_src2_64:
@@ -3632,59 +3594,59 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(ptr addrspace(1)
; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31]
; NOLIT-SRCC-NEXT: s_nop 15
; NOLIT-SRCC-NEXT: s_nop 1
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:112
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35]
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16
; NOLIT-SRCC-NEXT: s_endpgm
@@ -3768,59 +3730,59 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(ptr addrspace(1)
; LIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31]
; LIT-SRCC-NEXT: s_nop 15
; LIT-SRCC-NEXT: s_nop 1
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96
; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:112
; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64
; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80
; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32
; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48
; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35]
; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16
; LIT-SRCC-NEXT: s_endpgm
@@ -4049,22 +4011,22 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_forward_acc(ptr addrspace(1)
; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
; NOLIT-SRCC-NEXT: s_nop 9
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a1
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a2
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a3
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a2
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a1
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a0
; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:48
; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:32
; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16
@@ -4116,22 +4078,22 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_forward_acc(ptr addrspace(1)
; LIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
; LIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
; LIT-SRCC-NEXT: s_nop 9
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a1
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a2
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a3
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a2
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a1
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a0
; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:48
; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:32
; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16
@@ -4478,32 +4440,32 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm_splat(ptr addrspace(1) %
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0
; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
; NOLIT-SRCC-NEXT: s_nop 9
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
; NOLIT-SRCC-NEXT: s_nop 0
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; NOLIT-SRCC-NEXT: s_endpgm
@@ -4516,28 +4478,28 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm_splat(ptr addrspace(1) %
; LIT-SRCC-NEXT: v_mov_b32_e32 v8, 0
; LIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, 1.0
; LIT-SRCC-NEXT: s_nop 9
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8
; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
; LIT-SRCC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:48
; LIT-SRCC-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:32
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
; LIT-SRCC-NEXT: s_endpgm
@@ -4622,32 +4584,32 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16_imm_splat(ptr addrspace(1) %
; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], a[0:15]
; NOLIT-SRCC-NEXT: s_nop 15
; NOLIT-SRCC-NEXT: s_nop 1
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
; NOLIT-SRCC-NEXT: s_nop 0
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; NOLIT-SRCC-NEXT: s_endpgm
@@ -4659,31 +4621,33 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16_imm_splat(ptr addrspace(1) %
; LIT-SRCC-NEXT: v_mov_b32_e32 v2, 0x40004000
; LIT-SRCC-NEXT: v_mov_b32_e32 v3, v2
; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; LIT-SRCC-NEXT: v_mov_b32_e32 v16, 0
+; LIT-SRCC-NEXT: v_mov_b32_e32 v13, 0
; LIT-SRCC-NEXT: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], 1.0
; LIT-SRCC-NEXT: s_nop 15
; LIT-SRCC-NEXT: s_nop 1
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a1
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a2
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a3
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a3
; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
-; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] offset:48
-; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:32
-; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:16
-; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1]
+; LIT-SRCC-NEXT: global_store_dwordx4 v13, v[0:3], s[0:1] offset:48
+; LIT-SRCC-NEXT: global_store_dwordx4 v13, v[4:7], s[0:1] offset:32
+; LIT-SRCC-NEXT: global_store_dwordx4 v13, v[8:11], s[0:1] offset:16
+; LIT-SRCC-NEXT: s_nop 0
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a2
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a1
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a0
+; LIT-SRCC-NEXT: s_nop 1
+; LIT-SRCC-NEXT: global_store_dwordx4 v13, v[9:12], s[0:1]
; LIT-SRCC-NEXT: s_endpgm
;
; GFX90A-LABEL: test_mfma_f32_32x32x8f16_imm_splat:
@@ -4787,60 +4751,60 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm_splat(ptr addrspace(1) %
; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
; NOLIT-SRCC-NEXT: s_nop 15
; NOLIT-SRCC-NEXT: s_nop 1
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28
; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
; NOLIT-SRCC-NEXT: s_nop 0
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; NOLIT-SRCC-NEXT: s_endpgm
@@ -4850,55 +4814,55 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm_splat(ptr addrspace(1) %
; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0
; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0
; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; LIT-SRCC-NEXT: v_mov_b32_e32 v16, 0
+; LIT-SRCC-NEXT: v_mov_b32_e32 v14, 0
; LIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, 0
; LIT-SRCC-NEXT: s_nop 15
; LIT-SRCC-NEXT: s_nop 1
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a24
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a25
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a26
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a27
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a20
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a21
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a22
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a26
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a25
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a24
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a23
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a22
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a21
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a20
; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
-; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] offset:112
-; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:96
-; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:80
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
+; LIT-SRCC-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] offset:112
+; LIT-SRCC-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:96
+; LIT-SRCC-NEXT: global_store_dwordx4 v14, v[8:11], s[0:1] offset:80
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a16
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a17
-; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] offset:48
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a18
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a19
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a18
+; LIT-SRCC-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] offset:48
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a17
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a19
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a16
; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] offset:32
-; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:64
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
+; LIT-SRCC-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] offset:32
+; LIT-SRCC-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] offset:64
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4
; LIT-SRCC-NEXT: s_nop 1
-; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] offset:16
+; LIT-SRCC-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] offset:16
; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
; LIT-SRCC-NEXT: s_nop 1
-; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; LIT-SRCC-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1]
; LIT-SRCC-NEXT: s_endpgm
;
; GFX90A-LABEL: test_mfma_f32_32x32x1f32_imm_splat:
@@ -5091,32 +5055,32 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm(ptr addrspace(1) %arg) #
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0
; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
; NOLIT-SRCC-NEXT: s_nop 9
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
; NOLIT-SRCC-NEXT: s_nop 0
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; NOLIT-SRCC-NEXT: s_endpgm
@@ -5145,32 +5109,32 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm(ptr addrspace(1) %arg) #
; LIT-SRCC-NEXT: v_mov_b32_e32 v4, 0
; LIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
; LIT-SRCC-NEXT: s_nop 9
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
; LIT-SRCC-NEXT: s_nop 0
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; LIT-SRCC-NEXT: s_endpgm
@@ -5313,60 +5277,60 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) #
; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
; NOLIT-SRCC-NEXT: s_nop 15
; NOLIT-SRCC-NEXT: s_nop 1
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28
; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
; NOLIT-SRCC-NEXT: s_nop 0
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; NOLIT-SRCC-NEXT: s_endpgm
@@ -5412,60 +5376,60 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) #
; LIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
; LIT-SRCC-NEXT: s_nop 15
; LIT-SRCC-NEXT: s_nop 1
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28
; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
; LIT-SRCC-NEXT: s_nop 0
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112
; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96
; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80
; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64
; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; LIT-SRCC-NEXT: s_endpgm
@@ -5916,40 +5880,40 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(ptr addrspace(1) %arg
; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] cbsz:1 abid:2 blgp:3
; NOLIT-SRCC-NEXT: s_nop 15
; NOLIT-SRCC-NEXT: s_nop 1
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a28
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a29
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a30
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a31
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a16
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a17
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a18
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a30
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a29
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a28
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a19
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a20
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a21
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a22
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a18
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a17
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a16
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a23
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v16, a8
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v17, a9
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v18, a10
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a22
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a21
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a20
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v19, a11
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v20, a12
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v21, a13
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v22, a14
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v18, a10
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v17, a9
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v16, a8
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v23, a15
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v24, a0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v25, a1
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v26, a2
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v22, a14
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v21, a13
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v20, a12
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v27, a3
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v26, a2
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v25, a1
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v24, a0
; NOLIT-SRCC-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:96
; NOLIT-SRCC-NEXT: s_nop 0
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
-; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4
; NOLIT-SRCC-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:112
; NOLIT-SRCC-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:64
; NOLIT-SRCC-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:80
@@ -6011,40 +5975,40 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(ptr addrspace(1) %arg
; LIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] cbsz:1 abid:2 blgp:3
; LIT-SRCC-NEXT: s_nop 15
; LIT-SRCC-NEXT: s_nop 1
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a28
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a29
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a30
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a31
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a16
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a17
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a18
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a30
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a29
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a28
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a19
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a20
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a21
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a22
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a18
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a17
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a16
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a23
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v16, a8
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v17, a9
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v18, a10
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a22
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a21
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a20
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v19, a11
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v20, a12
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v21, a13
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v22, a14
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v18, a10
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v17, a9
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v16, a8
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v23, a15
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v24, a0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v25, a1
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v26, a2
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v22, a14
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v21, a13
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v20, a12
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v27, a3
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v26, a2
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v25, a1
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v24, a0
; LIT-SRCC-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:96
; LIT-SRCC-NEXT: s_nop 0
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
-; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4
; LIT-SRCC-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:112
; LIT-SRCC-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:64
; LIT-SRCC-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:80
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
index 52dcfb735a899..aae14c8cc87b3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
@@ -14,37 +14,21 @@
; fp8 x fp8
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 0, ; cbsz
i32 0, ; blgp
@@ -53,37 +37,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0(<8 x
}
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_1_1__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_1_1__cbsz1__blgp1:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,1,0] op_sel_hi:[0,0,0]
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_1_1__cbsz1__blgp1:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,1,0] op_sel_hi:[0,0,0]
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_1_1__cbsz1__blgp1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,1,0] op_sel_hi:[0,0,0]
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 0, ; cbsz
i32 0, ; blgp
@@ -92,37 +60,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_1_1__cbsz1__blgp1(<8 x
}
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_2__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_2_2__cbsz1__blgp1:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[1,1,0]
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_2_2__cbsz1__blgp1:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[1,1,0]
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_2_2__cbsz1__blgp1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[1,1,0]
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 0, ; cbsz
i32 0, ; blgp
@@ -131,37 +83,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_2__cbsz1__blgp1(<8 x
}
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_3__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_3__cbsz1__blgp1:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,1,0]
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_3__cbsz1__blgp1:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,1,0]
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_3__cbsz1__blgp1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 0, ; cbsz
i32 0, ; blgp
@@ -170,37 +106,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_3__cbsz1__blgp1(<8 x
}
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_3__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_3__cbsz1__blgp1:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[0,1,0] op_sel_hi:[0,1,0]
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_3__cbsz1__blgp1:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[0,1,0] op_sel_hi:[0,1,0]
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_3__cbsz1__blgp1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[0,1,0] op_sel_hi:[0,1,0]
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 0, ; cbsz
i32 0, ; blgp
@@ -209,37 +129,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_3__cbsz1__blgp1(<8 x
}
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_0__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_0__cbsz1__blgp1:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,0,0] op_sel_hi:[1,0,0]
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_0__cbsz1__blgp1:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,0,0] op_sel_hi:[1,0,0]
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_0__cbsz1__blgp1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 0, ; cbsz
i32 0, ; blgp
@@ -248,37 +152,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_0__cbsz1__blgp1(<8 x
}
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_3__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_2_3__cbsz1__blgp1:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[0,1,0] op_sel_hi:[1,1,0]
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_2_3__cbsz1__blgp1:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[0,1,0] op_sel_hi:[1,1,0]
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_2_3__cbsz1__blgp1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[0,1,0] op_sel_hi:[1,1,0]
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 0, ; cbsz
i32 0, ; blgp
@@ -287,37 +175,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_3__cbsz1__blgp1(<8 x
}
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_2__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_2__cbsz1__blgp1:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,0,0] op_sel_hi:[1,1,0]
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_2__cbsz1__blgp1:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,0,0] op_sel_hi:[1,1,0]
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_2__cbsz1__blgp1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,0,0] op_sel_hi:[1,1,0]
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 0, ; cbsz
i32 0, ; blgp
@@ -327,37 +199,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_2__cbsz1__blgp1(<8 x
; This should be optimized to avoid the scale
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0__constant_scale_0_0:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3]
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0__constant_scale_0_0:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3]
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3]
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 0, ; cbsz
i32 0, ; blgp
@@ -367,37 +223,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0__cons
; fp8 x bf8
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:1
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:1
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:1
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 0, ; cbsz
i32 1, ; blgp
@@ -407,37 +247,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1(<8 x
; This should be optimized to avoid the scale
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1__constant_scale_0_0:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] blgp:1
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1__constant_scale_0_0:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] blgp:1
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] blgp:1
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 0, ; cbsz
i32 1, ; blgp
@@ -447,37 +271,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1__cons
; fp8 x fp6
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v14
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:2
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v17
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:2
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v14
+; GCN-NEXT: v_accvgpr_write_b32 a1, v15
+; GCN-NEXT: v_accvgpr_write_b32 a2, v16
+; GCN-NEXT: v_accvgpr_write_b32 a3, v17
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:2
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
i32 0, ; cbsz
i32 2, ; blgp
@@ -487,37 +295,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2(<8 x
; This should be optimized to avoid the scale
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v14
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] blgp:2
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v17
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] blgp:2
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v14
+; GCN-NEXT: v_accvgpr_write_b32 a1, v15
+; GCN-NEXT: v_accvgpr_write_b32 a2, v16
+; GCN-NEXT: v_accvgpr_write_b32 a3, v17
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] blgp:2
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
i32 0, ; cbsz
i32 2, ; blgp
@@ -527,37 +319,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2__cons
; fp8 x bf6
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v14
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:3
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v17
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:3
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v14
+; GCN-NEXT: v_accvgpr_write_b32 a1, v15
+; GCN-NEXT: v_accvgpr_write_b32 a2, v16
+; GCN-NEXT: v_accvgpr_write_b32 a3, v17
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:3
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
i32 0, ; cbsz
i32 3, ; blgp
@@ -567,37 +343,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3(<8 x
; This should be optimized to avoid the scale
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v14
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] blgp:3
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v17
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] blgp:3
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v14
+; GCN-NEXT: v_accvgpr_write_b32 a1, v15
+; GCN-NEXT: v_accvgpr_write_b32 a2, v16
+; GCN-NEXT: v_accvgpr_write_b32 a3, v17
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] blgp:3
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
i32 0, ; cbsz
i32 3, ; blgp
@@ -607,37 +367,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3__cons
; fp8 x fp4
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] blgp:4
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v12
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v13
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v15
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] blgp:4
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v12
+; GCN-NEXT: v_accvgpr_write_b32 a1, v13
+; GCN-NEXT: v_accvgpr_write_b32 a2, v14
+; GCN-NEXT: v_accvgpr_write_b32 a3, v15
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] blgp:4
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2,
i32 0, ; cbsz
i32 4, ; blgp
@@ -647,37 +391,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4(<8 x
; This should be optimized to avoid the scale
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3] blgp:4
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v12
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v13
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v15
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3] blgp:4
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v12
+; GCN-NEXT: v_accvgpr_write_b32 a1, v13
+; GCN-NEXT: v_accvgpr_write_b32 a2, v14
+; GCN-NEXT: v_accvgpr_write_b32 a3, v15
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3] blgp:4
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2,
i32 0, ; cbsz
i32 4, ; blgp
@@ -687,37 +415,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4__cons
; bf8 x fp8
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:1
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:1
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:1
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 1, ; cbsz
i32 0, ; blgp
@@ -727,37 +439,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0(<8 x
; This should be optimized to avoid the scale
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:1
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:1
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:1
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 1, ; cbsz
i32 0, ; blgp
@@ -767,37 +463,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0__cons
; bf8 x bf8
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:1 blgp:1
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:1 blgp:1
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:1 blgp:1
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 1, ; cbsz
i32 1, ; blgp
@@ -808,37 +488,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1(<8 x
; This should be optimized to avoid the scale
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:1 blgp:1
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:1 blgp:1
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:1 blgp:1
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 1, ; cbsz
i32 1, ; blgp
@@ -848,37 +512,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1__cons
; bf8 x fp6
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v14
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:1 blgp:2
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v17
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:1 blgp:2
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v14
+; GCN-NEXT: v_accvgpr_write_b32 a1, v15
+; GCN-NEXT: v_accvgpr_write_b32 a2, v16
+; GCN-NEXT: v_accvgpr_write_b32 a3, v17
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:1 blgp:2
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
i32 1, ; cbsz
i32 2, ; blgp
@@ -887,37 +535,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2(<8 x
}
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v14
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] cbsz:1 blgp:2
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v17
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] cbsz:1 blgp:2
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v14
+; GCN-NEXT: v_accvgpr_write_b32 a1, v15
+; GCN-NEXT: v_accvgpr_write_b32 a2, v16
+; GCN-NEXT: v_accvgpr_write_b32 a3, v17
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] cbsz:1 blgp:2
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
i32 1, ; cbsz
i32 2, ; blgp
@@ -927,37 +559,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2__cons
; bf8 x bf6
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v14
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:1 blgp:3
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v17
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:1 blgp:3
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v14
+; GCN-NEXT: v_accvgpr_write_b32 a1, v15
+; GCN-NEXT: v_accvgpr_write_b32 a2, v16
+; GCN-NEXT: v_accvgpr_write_b32 a3, v17
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:1 blgp:3
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
i32 1, ; cbsz
i32 3, ; blgp
@@ -967,37 +583,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3(<8 x
; This should be optimized to avoid the scale
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v14
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] cbsz:1 blgp:3
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v17
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] cbsz:1 blgp:3
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v14
+; GCN-NEXT: v_accvgpr_write_b32 a1, v15
+; GCN-NEXT: v_accvgpr_write_b32 a2, v16
+; GCN-NEXT: v_accvgpr_write_b32 a3, v17
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] cbsz:1 blgp:3
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
i32 1, ; cbsz
i32 3, ; blgp
@@ -1007,37 +607,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3__cons
; bf8 x fp4
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:1 blgp:4
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v12
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v13
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v15
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:1 blgp:4
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v12
+; GCN-NEXT: v_accvgpr_write_b32 a1, v13
+; GCN-NEXT: v_accvgpr_write_b32 a2, v14
+; GCN-NEXT: v_accvgpr_write_b32 a3, v15
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:1 blgp:4
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2,
i32 1, ; cbsz
i32 4, ; blgp
@@ -1047,37 +631,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4(<8 x
; This should be optimized to avoid the scale
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3] cbsz:1 blgp:4
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v12
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v13
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v15
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3] cbsz:1 blgp:4
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v12
+; GCN-NEXT: v_accvgpr_write_b32 a1, v13
+; GCN-NEXT: v_accvgpr_write_b32 a2, v14
+; GCN-NEXT: v_accvgpr_write_b32 a3, v15
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3] cbsz:1 blgp:4
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2,
i32 1, ; cbsz
i32 4, ; blgp
@@ -1087,37 +655,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4__cons
; fp6 x fp8
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v14
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:2
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v17
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:2
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v14
+; GCN-NEXT: v_accvgpr_write_b32 a1, v15
+; GCN-NEXT: v_accvgpr_write_b32 a2, v16
+; GCN-NEXT: v_accvgpr_write_b32 a3, v17
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:2
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 2, ; cbsz
i32 0, ; blgp
@@ -1127,37 +679,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0(<6 x
; This should be optimized to avoid the scale
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v14
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:2
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v17
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:2
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v14
+; GCN-NEXT: v_accvgpr_write_b32 a1, v15
+; GCN-NEXT: v_accvgpr_write_b32 a2, v16
+; GCN-NEXT: v_accvgpr_write_b32 a3, v17
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:2
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 2, ; cbsz
i32 0, ; blgp
@@ -1167,37 +703,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0__cons
; fp6 x bf8
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v14
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:2 blgp:1
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v17
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:2 blgp:1
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v14
+; GCN-NEXT: v_accvgpr_write_b32 a1, v15
+; GCN-NEXT: v_accvgpr_write_b32 a2, v16
+; GCN-NEXT: v_accvgpr_write_b32 a3, v17
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:2 blgp:1
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 2, ; cbsz
i32 1, ; blgp
@@ -1207,37 +727,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1(<6 x
; This should be optimized to avoid the scale
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v14
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:2 blgp:1
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v17
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:2 blgp:1
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v14
+; GCN-NEXT: v_accvgpr_write_b32 a1, v15
+; GCN-NEXT: v_accvgpr_write_b32 a2, v16
+; GCN-NEXT: v_accvgpr_write_b32 a3, v17
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:2 blgp:1
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 2, ; cbsz
i32 1, ; blgp
@@ -1247,37 +751,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1__cons
; fp6 x fp6
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:2 blgp:2
-; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v12
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v13
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v15
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:2 blgp:2
-; GISEL-NEXT: s_nop 7
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v12
+; GCN-NEXT: v_accvgpr_write_b32 a1, v13
+; GCN-NEXT: v_accvgpr_write_b32 a2, v14
+; GCN-NEXT: v_accvgpr_write_b32 a3, v15
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:2 blgp:2
+; GCN-NEXT: s_nop 7
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
i32 2, ; cbsz
i32 2, ; blgp
@@ -1287,37 +775,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2(<6 x
; This should be optimized to avoid the scale
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:2 blgp:2
-; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v12
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v13
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v15
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:2 blgp:2
-; GISEL-NEXT: s_nop 7
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v12
+; GCN-NEXT: v_accvgpr_write_b32 a1, v13
+; GCN-NEXT: v_accvgpr_write_b32 a2, v14
+; GCN-NEXT: v_accvgpr_write_b32 a3, v15
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:2 blgp:2
+; GCN-NEXT: s_nop 7
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
i32 2, ; cbsz
i32 2, ; blgp
@@ -1327,37 +799,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2__cons
; fp6 x bf6
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:2 blgp:3
-; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v12
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v13
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v15
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:2 blgp:3
-; GISEL-NEXT: s_nop 7
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v12
+; GCN-NEXT: v_accvgpr_write_b32 a1, v13
+; GCN-NEXT: v_accvgpr_write_b32 a2, v14
+; GCN-NEXT: v_accvgpr_write_b32 a3, v15
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:2 blgp:3
+; GCN-NEXT: s_nop 7
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
i32 2, ; cbsz
i32 3, ; blgp
@@ -1367,37 +823,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3(<6 x
; This should be optimized to avoid the scale
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:2 blgp:3
-; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v12
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v13
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v15
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:2 blgp:3
-; GISEL-NEXT: s_nop 7
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v12
+; GCN-NEXT: v_accvgpr_write_b32 a1, v13
+; GCN-NEXT: v_accvgpr_write_b32 a2, v14
+; GCN-NEXT: v_accvgpr_write_b32 a3, v15
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:2 blgp:3
+; GCN-NEXT: s_nop 7
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
i32 2, ; cbsz
i32 3, ; blgp
@@ -1408,37 +848,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3__cons
; bf6 x fp8
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v14
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:3
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v17
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:3
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v14
+; GCN-NEXT: v_accvgpr_write_b32 a1, v15
+; GCN-NEXT: v_accvgpr_write_b32 a2, v16
+; GCN-NEXT: v_accvgpr_write_b32 a3, v17
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:3
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 3, ; cbsz
i32 0, ; blgp
@@ -1448,37 +872,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0(<6 x
; This should be optimized to avoid the scale
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v14
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:3
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v17
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:3
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v14
+; GCN-NEXT: v_accvgpr_write_b32 a1, v15
+; GCN-NEXT: v_accvgpr_write_b32 a2, v16
+; GCN-NEXT: v_accvgpr_write_b32 a3, v17
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:3
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 3, ; cbsz
i32 0, ; blgp
@@ -1488,37 +896,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0__cons
; bf6 x bf8
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v14
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:3 blgp:1
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v17
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:3 blgp:1
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v14
+; GCN-NEXT: v_accvgpr_write_b32 a1, v15
+; GCN-NEXT: v_accvgpr_write_b32 a2, v16
+; GCN-NEXT: v_accvgpr_write_b32 a3, v17
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:3 blgp:1
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 3, ; cbsz
i32 1, ; blgp
@@ -1528,37 +920,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1(<6 x
; This should be optimized to avoid the scale
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v14
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:3 blgp:1
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v17
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:3 blgp:1
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v14
+; GCN-NEXT: v_accvgpr_write_b32 a1, v15
+; GCN-NEXT: v_accvgpr_write_b32 a2, v16
+; GCN-NEXT: v_accvgpr_write_b32 a3, v17
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:3 blgp:1
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 3, ; cbsz
i32 1, ; blgp
@@ -1568,37 +944,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1__cons
; bf6 x fp6
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:3 blgp:2
-; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v12
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v13
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v15
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:3 blgp:2
-; GISEL-NEXT: s_nop 7
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v12
+; GCN-NEXT: v_accvgpr_write_b32 a1, v13
+; GCN-NEXT: v_accvgpr_write_b32 a2, v14
+; GCN-NEXT: v_accvgpr_write_b32 a3, v15
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:3 blgp:2
+; GCN-NEXT: s_nop 7
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
i32 3, ; cbsz
i32 2, ; blgp
@@ -1608,37 +968,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2(<6 x
; This should be optimized to avoid the scale
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:3 blgp:2
-; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v12
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v13
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v15
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:3 blgp:2
-; GISEL-NEXT: s_nop 7
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v12
+; GCN-NEXT: v_accvgpr_write_b32 a1, v13
+; GCN-NEXT: v_accvgpr_write_b32 a2, v14
+; GCN-NEXT: v_accvgpr_write_b32 a3, v15
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:3 blgp:2
+; GCN-NEXT: s_nop 7
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
i32 3, ; cbsz
i32 2, ; blgp
@@ -1648,37 +992,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2__cons
; bf6 x fp4
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v13
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v12
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v11
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v10
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:3 blgp:4
-; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v10
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v11
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v12
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v13
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:3 blgp:4
-; GISEL-NEXT: s_nop 7
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v10
+; GCN-NEXT: v_accvgpr_write_b32 a1, v11
+; GCN-NEXT: v_accvgpr_write_b32 a2, v12
+; GCN-NEXT: v_accvgpr_write_b32 a3, v13
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:3 blgp:4
+; GCN-NEXT: s_nop 7
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2,
i32 3, ; cbsz
i32 4, ; blgp
@@ -1688,37 +1016,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4(<6 x
; This should be optimized to avoid the scale
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v13
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v12
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v11
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v10
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3] cbsz:3 blgp:4
-; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v10
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v11
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v12
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v13
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3] cbsz:3 blgp:4
-; GISEL-NEXT: s_nop 7
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v10
+; GCN-NEXT: v_accvgpr_write_b32 a1, v11
+; GCN-NEXT: v_accvgpr_write_b32 a2, v12
+; GCN-NEXT: v_accvgpr_write_b32 a3, v13
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3] cbsz:3 blgp:4
+; GCN-NEXT: s_nop 7
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2,
i32 3, ; cbsz
i32 4, ; blgp
@@ -1728,37 +1040,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4__cons
; bf6 x bf6
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:3 blgp:3
-; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v12
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v13
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v15
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:3 blgp:3
-; GISEL-NEXT: s_nop 7
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v12
+; GCN-NEXT: v_accvgpr_write_b32 a1, v13
+; GCN-NEXT: v_accvgpr_write_b32 a2, v14
+; GCN-NEXT: v_accvgpr_write_b32 a3, v15
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:3 blgp:3
+; GCN-NEXT: s_nop 7
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
i32 3, ; cbsz
i32 3, ; blgp
@@ -1768,37 +1064,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3(<6 x
; This should be optimized to avoid the scale
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:3 blgp:3
-; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v12
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v13
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v15
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:3 blgp:3
-; GISEL-NEXT: s_nop 7
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v12
+; GCN-NEXT: v_accvgpr_write_b32 a1, v13
+; GCN-NEXT: v_accvgpr_write_b32 a2, v14
+; GCN-NEXT: v_accvgpr_write_b32 a3, v15
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:3 blgp:3
+; GCN-NEXT: s_nop 7
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
i32 3, ; cbsz
i32 3, ; blgp
@@ -1808,37 +1088,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3__cons
; fp6 x fp4
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v13
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v12
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v11
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v10
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:2 blgp:4
-; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v10
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v11
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v12
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v13
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:2 blgp:4
-; GISEL-NEXT: s_nop 7
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v10
+; GCN-NEXT: v_accvgpr_write_b32 a1, v11
+; GCN-NEXT: v_accvgpr_write_b32 a2, v12
+; GCN-NEXT: v_accvgpr_write_b32 a3, v13
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:2 blgp:4
+; GCN-NEXT: s_nop 7
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2,
i32 2, ; cbsz
i32 4, ; blgp
@@ -1848,37 +1112,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4(<6 x
; This should be optimized to avoid the scale
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v13
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v12
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v11
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v10
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3] cbsz:2 blgp:4
-; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v10
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v11
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v12
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v13
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3] cbsz:2 blgp:4
-; GISEL-NEXT: s_nop 7
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v10
+; GCN-NEXT: v_accvgpr_write_b32 a1, v11
+; GCN-NEXT: v_accvgpr_write_b32 a2, v12
+; GCN-NEXT: v_accvgpr_write_b32 a3, v13
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3] cbsz:2 blgp:4
+; GCN-NEXT: s_nop 7
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2,
i32 2, ; cbsz
i32 4, ; blgp
@@ -1888,37 +1136,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4__cons
; fp4 x fp8
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:4
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v12
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v13
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v15
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:4
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v12
+; GCN-NEXT: v_accvgpr_write_b32 a1, v13
+; GCN-NEXT: v_accvgpr_write_b32 a2, v14
+; GCN-NEXT: v_accvgpr_write_b32 a3, v15
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:4
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 4, ; cbsz
i32 0, ; blgp
@@ -1928,37 +1160,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0(<4 x
; This should be optimized to avoid the scale
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3] cbsz:4
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v12
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v13
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v15
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3] cbsz:4
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v12
+; GCN-NEXT: v_accvgpr_write_b32 a1, v13
+; GCN-NEXT: v_accvgpr_write_b32 a2, v14
+; GCN-NEXT: v_accvgpr_write_b32 a3, v15
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3] cbsz:4
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 4, ; cbsz
i32 0, ; blgp
@@ -1968,37 +1184,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0__cons
; fp4 x bf8
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:4 blgp:1
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v12
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v13
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v15
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:4 blgp:1
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v12
+; GCN-NEXT: v_accvgpr_write_b32 a1, v13
+; GCN-NEXT: v_accvgpr_write_b32 a2, v14
+; GCN-NEXT: v_accvgpr_write_b32 a3, v15
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:4 blgp:1
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 4, ; cbsz
i32 1, ; blgp
@@ -2008,37 +1208,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1(<4 x
; This should be optimized to avoid the scale
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3] cbsz:4 blgp:1
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v12
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v13
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v15
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3] cbsz:4 blgp:1
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v12
+; GCN-NEXT: v_accvgpr_write_b32 a1, v13
+; GCN-NEXT: v_accvgpr_write_b32 a2, v14
+; GCN-NEXT: v_accvgpr_write_b32 a3, v15
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3] cbsz:4 blgp:1
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 4, ; cbsz
i32 1, ; blgp
@@ -2048,77 +1232,45 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1__cons
; fp4 x fp6
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v13
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v12
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v11
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v10
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:4 blgp:2
-; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v10
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v11
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v12
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v13
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:4 blgp:2
-; GISEL-NEXT: s_nop 7
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
- i32 4, ; cbsz
- i32 2, ; blgp
- i32 0, i32 %scale0, i32 0, i32 %scale1)
- ret <4 x float> %result
-}
-
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v10
+; GCN-NEXT: v_accvgpr_write_b32 a1, v11
+; GCN-NEXT: v_accvgpr_write_b32 a2, v12
+; GCN-NEXT: v_accvgpr_write_b32 a3, v13
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:4 blgp:2
+; GCN-NEXT: s_nop 7
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
+ i32 4, ; cbsz
+ i32 2, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <4 x float> %result
+}
+
; This should be optimized to avoid the scale
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v13
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v12
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v11
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v10
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3] cbsz:4 blgp:2
-; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v10
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v11
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v12
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v13
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3] cbsz:4 blgp:2
-; GISEL-NEXT: s_nop 7
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v10
+; GCN-NEXT: v_accvgpr_write_b32 a1, v11
+; GCN-NEXT: v_accvgpr_write_b32 a2, v12
+; GCN-NEXT: v_accvgpr_write_b32 a3, v13
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3] cbsz:4 blgp:2
+; GCN-NEXT: s_nop 7
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
i32 4, ; cbsz
i32 2, ; blgp
@@ -2128,37 +1280,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2__cons
; fp4 x bf6
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v13
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v12
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v11
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v10
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:4 blgp:3
-; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v10
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v11
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v12
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v13
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:4 blgp:3
-; GISEL-NEXT: s_nop 7
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v10
+; GCN-NEXT: v_accvgpr_write_b32 a1, v11
+; GCN-NEXT: v_accvgpr_write_b32 a2, v12
+; GCN-NEXT: v_accvgpr_write_b32 a3, v13
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:4 blgp:3
+; GCN-NEXT: s_nop 7
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
i32 4, ; cbsz
i32 3, ; blgp
@@ -2168,37 +1304,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3(<4 x
; This should be optimized to avoid the scale
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v13
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v12
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v11
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v10
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3] cbsz:4 blgp:3
-; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v10
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v11
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v12
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v13
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3] cbsz:4 blgp:3
-; GISEL-NEXT: s_nop 7
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v10
+; GCN-NEXT: v_accvgpr_write_b32 a1, v11
+; GCN-NEXT: v_accvgpr_write_b32 a2, v12
+; GCN-NEXT: v_accvgpr_write_b32 a3, v13
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3] cbsz:4 blgp:3
+; GCN-NEXT: s_nop 7
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
i32 4, ; cbsz
i32 3, ; blgp
@@ -2208,37 +1328,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3__cons
; fp4 x fp4
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4(<4 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:7], a[0:3], v12, v13 op_sel_hi:[0,0,0] cbsz:4 blgp:4
-; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v8
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v9
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v11
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:7], a[0:3], v12, v13 op_sel_hi:[0,0,0] cbsz:4 blgp:4
-; GISEL-NEXT: s_nop 7
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v8
+; GCN-NEXT: v_accvgpr_write_b32 a1, v9
+; GCN-NEXT: v_accvgpr_write_b32 a2, v10
+; GCN-NEXT: v_accvgpr_write_b32 a3, v11
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:7], a[0:3], v12, v13 op_sel_hi:[0,0,0] cbsz:4 blgp:4
+; GCN-NEXT: s_nop 7
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2,
i32 4, ; cbsz
i32 4, ; blgp
@@ -2248,37 +1352,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4(<4 x
; This should be optimized to avoid the scale
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0(<4 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:4 blgp:4
-; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v8
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v9
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v11
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:4 blgp:4
-; GISEL-NEXT: s_nop 7
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v8
+; GCN-NEXT: v_accvgpr_write_b32 a1, v9
+; GCN-NEXT: v_accvgpr_write_b32 a2, v10
+; GCN-NEXT: v_accvgpr_write_b32 a3, v11
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:4 blgp:4
+; GCN-NEXT: s_nop 7
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2,
i32 4, ; cbsz
i32 4, ; blgp
@@ -2291,17 +1379,97 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4__cons
; --------------------------------------------------------------------
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__sgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 inreg %scale0, i32 inreg %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__sgpr_scaleB:
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__sgpr_scaleB:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: v_mov_b32_e32 v16, s0
+; GCN-NEXT: v_mov_b32_e32 v17, s1
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0]
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <4 x float> %result
+}
+
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__vgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 inreg %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__vgpr_scaleB:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: v_mov_b32_e32 v16, s0
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v20 op_sel_hi:[0,0,0]
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <4 x float> %result
+}
+
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__vgpr_scaleA__sgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 inreg %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__vgpr_scaleA__sgpr_scaleB:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: v_mov_b32_e32 v16, s0
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v16 op_sel_hi:[0,0,0]
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <4 x float> %result
+}
+
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inreg %arg0, <8 x i32> inreg %arg1, <4 x float> inreg %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: v_mov_b32_e32 v16, s0
-; SDAG-NEXT: v_mov_b32_e32 v17, s1
+; SDAG-NEXT: v_mov_b32_e32 v14, s0
+; SDAG-NEXT: v_mov_b32_e32 v15, s1
+; SDAG-NEXT: v_mov_b32_e32 v16, s2
+; SDAG-NEXT: v_mov_b32_e32 v17, s3
+; SDAG-NEXT: v_mov_b32_e32 v18, s16
+; SDAG-NEXT: v_mov_b32_e32 v19, s17
+; SDAG-NEXT: v_mov_b32_e32 v20, s18
+; SDAG-NEXT: v_mov_b32_e32 v21, s19
+; SDAG-NEXT: v_mov_b32_e32 v4, s28
+; SDAG-NEXT: v_mov_b32_e32 v5, s29
+; SDAG-NEXT: v_mov_b32_e32 v6, s20
+; SDAG-NEXT: v_mov_b32_e32 v7, s21
+; SDAG-NEXT: v_mov_b32_e32 v8, s22
+; SDAG-NEXT: v_mov_b32_e32 v9, s23
+; SDAG-NEXT: v_mov_b32_e32 v10, s24
+; SDAG-NEXT: v_mov_b32_e32 v11, s25
+; SDAG-NEXT: v_mov_b32_e32 v12, s26
+; SDAG-NEXT: v_mov_b32_e32 v13, s27
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v4
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v5
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v0
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v1
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[6:13], a[0:3], v2, v3 op_sel_hi:[0,0,0]
; SDAG-NEXT: s_nop 11
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
@@ -2309,17 +1477,29 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__sgpr_
; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__sgpr_scaleB:
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: v_mov_b32_e32 v16, s0
-; GISEL-NEXT: v_mov_b32_e32 v17, s1
+; GISEL-NEXT: s_mov_b32 s12, s0
+; GISEL-NEXT: s_mov_b32 s13, s1
+; GISEL-NEXT: s_mov_b32 s14, s2
+; GISEL-NEXT: s_mov_b32 s15, s3
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GISEL-NEXT: v_mov_b32_e32 v20, s28
+; GISEL-NEXT: v_mov_b32_e32 v21, s29
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v0
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v1
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0]
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[4:11], v[12:19], a[0:3], v2, v3 op_sel_hi:[0,0,0]
; GISEL-NEXT: s_nop 11
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
@@ -2330,148 +1510,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__sgpr_
ret <4 x float> %result
}
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__vgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 inreg %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__vgpr_scaleB:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: v_mov_b32_e32 v16, s0
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v20 op_sel_hi:[0,0,0]
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__vgpr_scaleB:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: v_mov_b32_e32 v16, s0
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v20 op_sel_hi:[0,0,0]
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
- ret <4 x float> %result
-}
-
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__vgpr_scaleA__sgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 inreg %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__vgpr_scaleA__sgpr_scaleB:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: v_mov_b32_e32 v16, s0
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v16 op_sel_hi:[0,0,0]
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__vgpr_scaleA__sgpr_scaleB:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: v_mov_b32_e32 v16, s0
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v16 op_sel_hi:[0,0,0]
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
- ret <4 x float> %result
-}
-
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inreg %arg0, <8 x i32> inreg %arg1, <4 x float> inreg %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v12, s0
-; SDAG-NEXT: v_mov_b32_e32 v13, s1
-; SDAG-NEXT: v_mov_b32_e32 v14, s2
-; SDAG-NEXT: v_mov_b32_e32 v15, s3
-; SDAG-NEXT: v_mov_b32_e32 v16, s16
-; SDAG-NEXT: v_mov_b32_e32 v17, s17
-; SDAG-NEXT: v_mov_b32_e32 v18, s18
-; SDAG-NEXT: v_mov_b32_e32 v19, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v1
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v0
-; SDAG-NEXT: v_mov_b32_e32 v4, s20
-; SDAG-NEXT: v_mov_b32_e32 v5, s21
-; SDAG-NEXT: v_mov_b32_e32 v6, s22
-; SDAG-NEXT: v_mov_b32_e32 v7, s23
-; SDAG-NEXT: v_mov_b32_e32 v8, s24
-; SDAG-NEXT: v_mov_b32_e32 v9, s25
-; SDAG-NEXT: v_mov_b32_e32 v10, s26
-; SDAG-NEXT: v_mov_b32_e32 v11, s27
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s28
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s29
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[12:19], v[4:11], a[0:3], v2, v3 op_sel_hi:[0,0,0]
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: s_mov_b32 s12, s0
-; GISEL-NEXT: s_mov_b32 s13, s1
-; GISEL-NEXT: s_mov_b32 s14, s2
-; GISEL-NEXT: s_mov_b32 s15, s3
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v0
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v1
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s28
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s29
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[4:11], v[12:19], a[0:3], v2, v3 op_sel_hi:[0,0,0]
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
- ret <4 x float> %result
-}
-
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr(<8 x i32> inreg %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 inreg %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr:
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr(<8 x i32> inreg %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 inreg %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v14, s0
@@ -2482,10 +1522,10 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp
; SDAG-NEXT: v_mov_b32_e32 v19, s17
; SDAG-NEXT: v_mov_b32_e32 v20, s18
; SDAG-NEXT: v_mov_b32_e32 v21, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
; SDAG-NEXT: v_mov_b32_e32 v8, s20
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], v8, v12 op_sel_hi:[0,0,0]
@@ -2536,10 +1576,10 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp
; SDAG-NEXT: v_mov_b32_e32 v19, s17
; SDAG-NEXT: v_mov_b32_e32 v20, s18
; SDAG-NEXT: v_mov_b32_e32 v21, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
; SDAG-NEXT: v_mov_b32_e32 v8, s20
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], v12, v8 op_sel_hi:[0,0,0]
@@ -2582,10 +1622,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp
; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgpr_sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
; SDAG-NEXT: v_mov_b32_e32 v14, s0
; SDAG-NEXT: v_mov_b32_e32 v15, s1
; SDAG-NEXT: v_mov_b32_e32 v16, s2
@@ -2594,6 +1630,10 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp
; SDAG-NEXT: v_mov_b32_e32 v19, s17
; SDAG-NEXT: v_mov_b32_e32 v20, s18
; SDAG-NEXT: v_mov_b32_e32 v21, s19
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
; SDAG-NEXT: v_mov_b32_e32 v8, s20
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[14:21], a[0:3], v12, v8 op_sel_hi:[0,0,0]
@@ -2612,13 +1652,13 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp
; GISEL-NEXT: s_mov_b32 s14, s2
; GISEL-NEXT: s_mov_b32 s15, s3
; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13]
; GISEL-NEXT: v_accvgpr_write_b32 a0, v8
; GISEL-NEXT: v_accvgpr_write_b32 a1, v9
; GISEL-NEXT: v_accvgpr_write_b32 a2, v10
; GISEL-NEXT: v_accvgpr_write_b32 a3, v11
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13]
; GISEL-NEXT: v_mov_b32_e32 v8, s20
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[14:21], a[0:3], v12, v8 op_sel_hi:[0,0,0]
@@ -2711,14 +1751,14 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_inlineimm__
; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_mov_b32_e32 v20, -2
+; SDAG-NEXT: v_mov_b32_e32 v21, 33
; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: v_mov_b32_e32 v16, -2
-; SDAG-NEXT: v_mov_b32_e32 v17, 33
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v17, v16 op_sel_hi:[1,1,0]
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v21, v20 op_sel_hi:[1,1,0]
; SDAG-NEXT: s_nop 11
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
@@ -2751,14 +1791,14 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale
; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scaleB_inlineimm:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_mov_b32_e32 v20, -2
+; SDAG-NEXT: v_mov_b32_e32 v21, 0x41
; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: v_mov_b32_e32 v16, -2
-; SDAG-NEXT: v_mov_b32_e32 v17, 0x41
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v17, v16 op_sel_hi:[1,1,0]
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v21, v20 op_sel_hi:[1,1,0]
; SDAG-NEXT: s_nop 11
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
@@ -2791,14 +1831,14 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale
; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scaleB_kimm:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_mov_b32_e32 v20, 0x4d
+; SDAG-NEXT: v_mov_b32_e32 v21, 0x41
; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: v_mov_b32_e32 v16, 0x4d
-; SDAG-NEXT: v_mov_b32_e32 v17, 0x41
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v17, v16 op_sel_hi:[1,1,0]
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v21, v20 op_sel_hi:[1,1,0]
; SDAG-NEXT: s_nop 11
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
@@ -3145,328 +2185,58 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; This should be optimized to avoid the scale
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_a(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_a:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3]
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_a:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3]
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_a:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3]
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
ret <4 x float> %result
}
; This should be optimized to avoid the scale, with non-0 op_sel arguments.
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_b(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_b:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3]
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_b:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3]
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 0, i32 1, i32 0)
- ret <4 x float> %result
-}
-
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: v_mov_b32_e32 v16, 1
-; SDAG-NEXT: v_mov_b32_e32 v17, 0
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v17, v16 op_sel_hi:[0,0,0]
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: v_mov_b32_e32 v16, 0
-; GISEL-NEXT: v_mov_b32_e32 v17, 1
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0]
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1)
- ret <4 x float> %result
-}
-
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
-; SDAG-NEXT: v_mov_b32_e32 v17, 1
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v17, v16 op_sel_hi:[0,0,0]
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: v_mov_b32_e32 v16, 1
-; GISEL-NEXT: v_mov_b32_e32 v17, 0
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0]
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0)
- ret <4 x float> %result
-}
-
-; --------------------------------------------------------------------
-; Incorrect signature for format cases (IR vector too large)
-; --------------------------------------------------------------------
-
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp6(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp6:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:2
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp6:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:2
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
- i32 0, ; cbsz
- i32 2, ; blgp
- i32 0, i32 %scale0, i32 0, i32 %scale1)
- ret <4 x float> %result
-}
-
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp8(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp8:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:2
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp8:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:2
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
- i32 2, ; cbsz
- i32 0, ; blgp
- i32 0, i32 %scale0, i32 0, i32 %scale1)
- ret <4 x float> %result
-}
-
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:2 blgp:2
-; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:2 blgp:2
-; GISEL-NEXT: s_nop 7
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
- i32 2, ; cbsz
- i32 2, ; blgp
- i32 0, i32 %scale0, i32 0, i32 %scale1)
- ret <4 x float> %result
-}
-
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6__0_scale(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6__0_scale:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:2 blgp:2
-; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6__0_scale:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:2 blgp:2
-; GISEL-NEXT: s_nop 7
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
- i32 2, ; cbsz
- i32 2, ; blgp
- i32 0, i32 0, i32 0, i32 0)
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_b:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3]
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 0, i32 1, i32 0)
ret <4 x float> %result
}
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp4(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp4:
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_mov_b32_e32 v20, 1
+; SDAG-NEXT: v_mov_b32_e32 v21, 0
; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:4
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v21, v20 op_sel_hi:[0,0,0]
; SDAG-NEXT: s_nop 11
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
@@ -3474,38 +2244,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp4(
; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp4:
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: v_mov_b32_e32 v16, 0
+; GISEL-NEXT: v_mov_b32_e32 v17, 1
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:4
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0]
; GISEL-NEXT: s_nop 11
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
- i32 0, ; cbsz
- i32 4, ; blgp
- i32 0, i32 %scale0, i32 0, i32 %scale1)
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1)
ret <4 x float> %result
}
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp8(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp8:
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_mov_b32_e32 v20, 0
+; SDAG-NEXT: v_mov_b32_e32 v21, 1
; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:4
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v21, v20 op_sel_hi:[0,0,0]
; SDAG-NEXT: s_nop 11
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
@@ -3513,21 +2284,162 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp8(
; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp8:
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: v_mov_b32_e32 v16, 1
+; GISEL-NEXT: v_mov_b32_e32 v17, 0
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:4
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0]
; GISEL-NEXT: s_nop 11
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0)
+ ret <4 x float> %result
+}
+
+; --------------------------------------------------------------------
+; Incorrect signature for format cases (IR vector too large)
+; --------------------------------------------------------------------
+
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp6(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp6:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:2
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
+ i32 0, ; cbsz
+ i32 2, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <4 x float> %result
+}
+
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp8(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp8:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:2
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
+ i32 2, ; cbsz
+ i32 0, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <4 x float> %result
+}
+
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:2 blgp:2
+; GCN-NEXT: s_nop 7
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
+ i32 2, ; cbsz
+ i32 2, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <4 x float> %result
+}
+
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6__0_scale(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6__0_scale:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:2 blgp:2
+; GCN-NEXT: s_nop 7
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
+ i32 2, ; cbsz
+ i32 2, ; blgp
+ i32 0, i32 0, i32 0, i32 0)
+ ret <4 x float> %result
+}
+
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp4(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp4:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:4
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
+ i32 0, ; cbsz
+ i32 4, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <4 x float> %result
+}
+
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp8(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp8:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:4
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 4, ; cbsz
i32 0, ; blgp
@@ -3536,37 +2448,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp8(
}
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v6i32_fp4(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v6i32_fp4:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v14
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:4
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v6i32_fp4:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v17
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:4
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v6i32_fp4:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v14
+; GCN-NEXT: v_accvgpr_write_b32 a1, v15
+; GCN-NEXT: v_accvgpr_write_b32 a2, v16
+; GCN-NEXT: v_accvgpr_write_b32 a3, v17
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:4
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
i32 0, ; cbsz
i32 4, ; blgp
@@ -3575,37 +2471,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v6i32_fp4(
}
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v6i32_fp4__v8i32_fp8(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v6i32_fp4__v8i32_fp8:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v14
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:4
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v6i32_fp4__v8i32_fp8:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v17
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:4
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v6i32_fp4__v8i32_fp8:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v14
+; GCN-NEXT: v_accvgpr_write_b32 a1, v15
+; GCN-NEXT: v_accvgpr_write_b32 a2, v16
+; GCN-NEXT: v_accvgpr_write_b32 a3, v17
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:4
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 4, ; cbsz
i32 0, ; blgp
@@ -3614,37 +2494,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v6i32_fp4__v8i32_fp8(
}
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:4 blgp:4
-; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:4 blgp:4
-; GISEL-NEXT: s_nop 7
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:4 blgp:4
+; GCN-NEXT: s_nop 7
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 4, ; cbsz
i32 4, ; blgp
@@ -3653,37 +2517,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4(
}
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4__0_scale(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
-; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4__0_scale:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:4 blgp:4
-; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4__0_scale:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:4 blgp:4
-; GISEL-NEXT: s_nop 7
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4__0_scale:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:4 blgp:4
+; GCN-NEXT: s_nop 7
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 4, ; cbsz
i32 4, ; blgp
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
index 7b7865e3434db..f0205a3a788ed 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
@@ -17,27 +17,27 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0(<8 x
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:8
-; SDAG-NEXT: scratch_load_dword v17, off, s32 offset:4
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[0,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0]
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
@@ -61,11 +61,11 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0(<8 x
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4
+; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8
; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4
-; GISEL-NEXT: scratch_load_dword v17, off, s32 offset:8
; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
@@ -81,7 +81,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0(<8 x
; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0]
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0]
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
@@ -112,27 +112,27 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_1_1__cbsz1__blgp1(<8 x
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_1_1__cbsz1__blgp1:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:8
-; SDAG-NEXT: scratch_load_dword v17, off, s32 offset:4
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel:[1,1,0] op_sel_hi:[0,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel:[1,1,0] op_sel_hi:[0,0,0]
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
@@ -156,11 +156,11 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_1_1__cbsz1__blgp1(<8 x
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_1_1__cbsz1__blgp1:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4
+; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8
; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4
-; GISEL-NEXT: scratch_load_dword v17, off, s32 offset:8
; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
@@ -176,7 +176,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_1_1__cbsz1__blgp1(<8 x
; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel:[1,1,0] op_sel_hi:[0,0,0]
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel:[1,1,0] op_sel_hi:[0,0,0]
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
@@ -207,27 +207,27 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_2__cbsz1__blgp1(<8 x
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_2_2__cbsz1__blgp1:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:8
-; SDAG-NEXT: scratch_load_dword v17, off, s32 offset:4
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[1,1,0]
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0]
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
@@ -251,11 +251,11 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_2__cbsz1__blgp1(<8 x
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_2_2__cbsz1__blgp1:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4
+; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8
; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4
-; GISEL-NEXT: scratch_load_dword v17, off, s32 offset:8
; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
@@ -271,7 +271,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_2__cbsz1__blgp1(<8 x
; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[1,1,0]
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0]
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
@@ -302,27 +302,27 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_3__cbsz1__blgp1(<8 x
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_3_3__cbsz1__blgp1:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:8
-; SDAG-NEXT: scratch_load_dword v17, off, s32 offset:4
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel:[1,1,0] op_sel_hi:[1,1,0]
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
@@ -346,11 +346,11 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_3__cbsz1__blgp1(<8 x
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_3_3__cbsz1__blgp1:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4
+; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8
; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4
-; GISEL-NEXT: scratch_load_dword v17, off, s32 offset:8
; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
@@ -366,7 +366,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_3__cbsz1__blgp1(<8 x
; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel:[1,1,0] op_sel_hi:[1,1,0]
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
@@ -397,27 +397,27 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_3__cbsz1__blgp1(<8 x
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_3__cbsz1__blgp1:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:8
-; SDAG-NEXT: scratch_load_dword v17, off, s32 offset:4
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel:[0,1,0] op_sel_hi:[0,1,0]
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel:[0,1,0] op_sel_hi:[0,1,0]
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
@@ -441,11 +441,11 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_3__cbsz1__blgp1(<8 x
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_3__cbsz1__blgp1:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4
+; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8
; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4
-; GISEL-NEXT: scratch_load_dword v17, off, s32 offset:8
; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
@@ -461,7 +461,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_3__cbsz1__blgp1(<8 x
; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel:[0,1,0] op_sel_hi:[0,1,0]
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel:[0,1,0] op_sel_hi:[0,1,0]
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
@@ -492,27 +492,27 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_0__cbsz1__blgp1(<8 x
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_3_0__cbsz1__blgp1:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:8
-; SDAG-NEXT: scratch_load_dword v17, off, s32 offset:4
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
@@ -536,11 +536,11 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_0__cbsz1__blgp1(<8 x
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_3_0__cbsz1__blgp1:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4
+; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8
; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4
-; GISEL-NEXT: scratch_load_dword v17, off, s32 offset:8
; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
@@ -556,7 +556,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_0__cbsz1__blgp1(<8 x
; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
@@ -587,27 +587,27 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_3__cbsz1__blgp1(<8 x
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_2_3__cbsz1__blgp1:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:8
-; SDAG-NEXT: scratch_load_dword v17, off, s32 offset:4
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel:[0,1,0] op_sel_hi:[1,1,0]
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel:[0,1,0] op_sel_hi:[1,1,0]
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
@@ -631,11 +631,11 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_3__cbsz1__blgp1(<8 x
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_2_3__cbsz1__blgp1:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4
+; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8
; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4
-; GISEL-NEXT: scratch_load_dword v17, off, s32 offset:8
; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
@@ -651,7 +651,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_3__cbsz1__blgp1(<8 x
; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel:[0,1,0] op_sel_hi:[1,1,0]
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel:[0,1,0] op_sel_hi:[1,1,0]
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
@@ -682,27 +682,27 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_2__cbsz1__blgp1(<8 x
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_3_2__cbsz1__blgp1:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:8
-; SDAG-NEXT: scratch_load_dword v17, off, s32 offset:4
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel:[1,0,0] op_sel_hi:[1,1,0]
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel:[1,0,0] op_sel_hi:[1,1,0]
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
@@ -726,11 +726,11 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_2__cbsz1__blgp1(<8 x
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_3_2__cbsz1__blgp1:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4
+; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8
; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4
-; GISEL-NEXT: scratch_load_dword v17, off, s32 offset:8
; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
@@ -746,7 +746,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_2__cbsz1__blgp1(<8 x
; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel:[1,0,0] op_sel_hi:[1,1,0]
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel:[1,0,0] op_sel_hi:[1,1,0]
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
@@ -775,89 +775,47 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_2__cbsz1__blgp1(<8 x
; This should be optimized to avoid the scale
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0__constant_scale_0_0:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15]
-; SDAG-NEXT: s_nop 15
-; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0__constant_scale_0_0:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15]
-; GISEL-NEXT: s_nop 15
-; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: scratch_load_dword a15, off, s32
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: v_accvgpr_write_b32 a4, v20
+; GCN-NEXT: v_accvgpr_write_b32 a5, v21
+; GCN-NEXT: v_accvgpr_write_b32 a6, v22
+; GCN-NEXT: v_accvgpr_write_b32 a7, v23
+; GCN-NEXT: v_accvgpr_write_b32 a8, v24
+; GCN-NEXT: v_accvgpr_write_b32 a9, v25
+; GCN-NEXT: v_accvgpr_write_b32 a10, v26
+; GCN-NEXT: v_accvgpr_write_b32 a11, v27
+; GCN-NEXT: v_accvgpr_write_b32 a12, v28
+; GCN-NEXT: v_accvgpr_write_b32 a13, v29
+; GCN-NEXT: v_accvgpr_write_b32 a14, v30
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15]
+; GCN-NEXT: s_nop 15
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 0, ; cbsz
i32 0, ; blgp
@@ -870,27 +828,27 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1(<8 x
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:8
-; SDAG-NEXT: scratch_load_dword v17, off, s32 offset:4
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[0,0,0] blgp:1
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] blgp:1
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
@@ -914,11 +872,11 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1(<8 x
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4
+; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8
; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4
-; GISEL-NEXT: scratch_load_dword v17, off, s32 offset:8
; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
@@ -934,7 +892,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1(<8 x
; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] blgp:1
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] blgp:1
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
@@ -962,89 +920,47 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1(<8 x
}
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1__constant_scale_0_0:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] blgp:1
-; SDAG-NEXT: s_nop 15
-; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1__constant_scale_0_0:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] blgp:1
-; GISEL-NEXT: s_nop 15
-; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: scratch_load_dword a15, off, s32
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: v_accvgpr_write_b32 a4, v20
+; GCN-NEXT: v_accvgpr_write_b32 a5, v21
+; GCN-NEXT: v_accvgpr_write_b32 a6, v22
+; GCN-NEXT: v_accvgpr_write_b32 a7, v23
+; GCN-NEXT: v_accvgpr_write_b32 a8, v24
+; GCN-NEXT: v_accvgpr_write_b32 a9, v25
+; GCN-NEXT: v_accvgpr_write_b32 a10, v26
+; GCN-NEXT: v_accvgpr_write_b32 a11, v27
+; GCN-NEXT: v_accvgpr_write_b32 a12, v28
+; GCN-NEXT: v_accvgpr_write_b32 a13, v29
+; GCN-NEXT: v_accvgpr_write_b32 a14, v30
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] blgp:1
+; GCN-NEXT: s_nop 15
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 0, ; cbsz
i32 1, ; blgp
@@ -1054,91 +970,48 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1__cons
; fp8 x fp6
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v14
-; SDAG-NEXT: scratch_load_dword v14, off, s32
-; SDAG-NEXT: v_accvgpr_write_b32 a15, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v15
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] blgp:2
-; SDAG-NEXT: s_nop 15
-; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v14
-; GISEL-NEXT: scratch_load_dword v14, off, s32
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a15, v29
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] blgp:2
-; GISEL-NEXT: s_nop 15
-; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: scratch_load_dword v31, off, s32
+; GCN-NEXT: v_accvgpr_write_b32 a0, v14
+; GCN-NEXT: v_accvgpr_write_b32 a1, v15
+; GCN-NEXT: v_accvgpr_write_b32 a2, v16
+; GCN-NEXT: v_accvgpr_write_b32 a3, v17
+; GCN-NEXT: v_accvgpr_write_b32 a4, v18
+; GCN-NEXT: v_accvgpr_write_b32 a5, v19
+; GCN-NEXT: v_accvgpr_write_b32 a6, v20
+; GCN-NEXT: v_accvgpr_write_b32 a7, v21
+; GCN-NEXT: v_accvgpr_write_b32 a8, v22
+; GCN-NEXT: v_accvgpr_write_b32 a9, v23
+; GCN-NEXT: v_accvgpr_write_b32 a10, v24
+; GCN-NEXT: v_accvgpr_write_b32 a11, v25
+; GCN-NEXT: v_accvgpr_write_b32 a12, v26
+; GCN-NEXT: v_accvgpr_write_b32 a13, v27
+; GCN-NEXT: v_accvgpr_write_b32 a14, v28
+; GCN-NEXT: v_accvgpr_write_b32 a15, v29
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] blgp:2
+; GCN-NEXT: s_nop 15
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
i32 0, ; cbsz
i32 2, ; blgp
@@ -1147,87 +1020,46 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2(<8 x
}
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a15, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v14
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] blgp:2
-; SDAG-NEXT: s_nop 15
-; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a15, v29
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] blgp:2
-; GISEL-NEXT: s_nop 15
-; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v14
+; GCN-NEXT: v_accvgpr_write_b32 a1, v15
+; GCN-NEXT: v_accvgpr_write_b32 a2, v16
+; GCN-NEXT: v_accvgpr_write_b32 a3, v17
+; GCN-NEXT: v_accvgpr_write_b32 a4, v18
+; GCN-NEXT: v_accvgpr_write_b32 a5, v19
+; GCN-NEXT: v_accvgpr_write_b32 a6, v20
+; GCN-NEXT: v_accvgpr_write_b32 a7, v21
+; GCN-NEXT: v_accvgpr_write_b32 a8, v22
+; GCN-NEXT: v_accvgpr_write_b32 a9, v23
+; GCN-NEXT: v_accvgpr_write_b32 a10, v24
+; GCN-NEXT: v_accvgpr_write_b32 a11, v25
+; GCN-NEXT: v_accvgpr_write_b32 a12, v26
+; GCN-NEXT: v_accvgpr_write_b32 a13, v27
+; GCN-NEXT: v_accvgpr_write_b32 a14, v28
+; GCN-NEXT: v_accvgpr_write_b32 a15, v29
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] blgp:2
+; GCN-NEXT: s_nop 15
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
i32 0, ; cbsz
i32 2, ; blgp
@@ -1237,29 +1069,226 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2__cons
; fp8 x bf6
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3:
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: scratch_load_dword v31, off, s32
+; GCN-NEXT: v_accvgpr_write_b32 a0, v14
+; GCN-NEXT: v_accvgpr_write_b32 a1, v15
+; GCN-NEXT: v_accvgpr_write_b32 a2, v16
+; GCN-NEXT: v_accvgpr_write_b32 a3, v17
+; GCN-NEXT: v_accvgpr_write_b32 a4, v18
+; GCN-NEXT: v_accvgpr_write_b32 a5, v19
+; GCN-NEXT: v_accvgpr_write_b32 a6, v20
+; GCN-NEXT: v_accvgpr_write_b32 a7, v21
+; GCN-NEXT: v_accvgpr_write_b32 a8, v22
+; GCN-NEXT: v_accvgpr_write_b32 a9, v23
+; GCN-NEXT: v_accvgpr_write_b32 a10, v24
+; GCN-NEXT: v_accvgpr_write_b32 a11, v25
+; GCN-NEXT: v_accvgpr_write_b32 a12, v26
+; GCN-NEXT: v_accvgpr_write_b32 a13, v27
+; GCN-NEXT: v_accvgpr_write_b32 a14, v28
+; GCN-NEXT: v_accvgpr_write_b32 a15, v29
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] blgp:3
+; GCN-NEXT: s_nop 15
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
+ i32 0, ; cbsz
+ i32 3, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <16 x float> %result
+}
+
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v14
+; GCN-NEXT: v_accvgpr_write_b32 a1, v15
+; GCN-NEXT: v_accvgpr_write_b32 a2, v16
+; GCN-NEXT: v_accvgpr_write_b32 a3, v17
+; GCN-NEXT: v_accvgpr_write_b32 a4, v18
+; GCN-NEXT: v_accvgpr_write_b32 a5, v19
+; GCN-NEXT: v_accvgpr_write_b32 a6, v20
+; GCN-NEXT: v_accvgpr_write_b32 a7, v21
+; GCN-NEXT: v_accvgpr_write_b32 a8, v22
+; GCN-NEXT: v_accvgpr_write_b32 a9, v23
+; GCN-NEXT: v_accvgpr_write_b32 a10, v24
+; GCN-NEXT: v_accvgpr_write_b32 a11, v25
+; GCN-NEXT: v_accvgpr_write_b32 a12, v26
+; GCN-NEXT: v_accvgpr_write_b32 a13, v27
+; GCN-NEXT: v_accvgpr_write_b32 a14, v28
+; GCN-NEXT: v_accvgpr_write_b32 a15, v29
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] blgp:3
+; GCN-NEXT: s_nop 15
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
+ i32 0, ; cbsz
+ i32 3, ; blgp
+ i32 0, i32 0, i32 0, i32 0)
+ ret <16 x float> %result
+}
+
+; fp8 x fp4
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v12
+; GCN-NEXT: v_accvgpr_write_b32 a1, v13
+; GCN-NEXT: v_accvgpr_write_b32 a2, v14
+; GCN-NEXT: v_accvgpr_write_b32 a3, v15
+; GCN-NEXT: v_accvgpr_write_b32 a4, v16
+; GCN-NEXT: v_accvgpr_write_b32 a5, v17
+; GCN-NEXT: v_accvgpr_write_b32 a6, v18
+; GCN-NEXT: v_accvgpr_write_b32 a7, v19
+; GCN-NEXT: v_accvgpr_write_b32 a8, v20
+; GCN-NEXT: v_accvgpr_write_b32 a9, v21
+; GCN-NEXT: v_accvgpr_write_b32 a10, v22
+; GCN-NEXT: v_accvgpr_write_b32 a11, v23
+; GCN-NEXT: v_accvgpr_write_b32 a12, v24
+; GCN-NEXT: v_accvgpr_write_b32 a13, v25
+; GCN-NEXT: v_accvgpr_write_b32 a14, v26
+; GCN-NEXT: v_accvgpr_write_b32 a15, v27
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] blgp:4
+; GCN-NEXT: s_nop 15
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
+ i32 0, ; cbsz
+ i32 4, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <16 x float> %result
+}
+
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v12
+; GCN-NEXT: v_accvgpr_write_b32 a1, v13
+; GCN-NEXT: v_accvgpr_write_b32 a2, v14
+; GCN-NEXT: v_accvgpr_write_b32 a3, v15
+; GCN-NEXT: v_accvgpr_write_b32 a4, v16
+; GCN-NEXT: v_accvgpr_write_b32 a5, v17
+; GCN-NEXT: v_accvgpr_write_b32 a6, v18
+; GCN-NEXT: v_accvgpr_write_b32 a7, v19
+; GCN-NEXT: v_accvgpr_write_b32 a8, v20
+; GCN-NEXT: v_accvgpr_write_b32 a9, v21
+; GCN-NEXT: v_accvgpr_write_b32 a10, v22
+; GCN-NEXT: v_accvgpr_write_b32 a11, v23
+; GCN-NEXT: v_accvgpr_write_b32 a12, v24
+; GCN-NEXT: v_accvgpr_write_b32 a13, v25
+; GCN-NEXT: v_accvgpr_write_b32 a14, v26
+; GCN-NEXT: v_accvgpr_write_b32 a15, v27
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15] blgp:4
+; GCN-NEXT: s_nop 15
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
+ i32 0, ; cbsz
+ i32 4, ; blgp
+ i32 0, i32 0, i32 0, i32 0)
+ ret <16 x float> %result
+}
+
+; bf8 x fp8
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v14
-; SDAG-NEXT: scratch_load_dword v14, off, s32
-; SDAG-NEXT: v_accvgpr_write_b32 a15, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v15
+; SDAG-NEXT: scratch_load_dword a15, off, s32
+; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] blgp:3
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:1
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
@@ -1280,29 +1309,30 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3(<8 x
; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3:
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v14
-; GISEL-NEXT: scratch_load_dword v14, off, s32
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a15, v29
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4
+; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] blgp:3
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:1
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
@@ -1322,307 +1352,88 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3(<8 x
; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
- i32 0, ; cbsz
- i32 3, ; blgp
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 1, ; cbsz
+ i32 0, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a15, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v14
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] blgp:3
-; SDAG-NEXT: s_nop 15
-; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a15, v29
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] blgp:3
-; GISEL-NEXT: s_nop 15
-; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
-; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
- i32 0, ; cbsz
- i32 3, ; blgp
- i32 0, i32 0, i32 0, i32 0)
- ret <16 x float> %result
-}
-
-; fp8 x fp4
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] blgp:4
-; SDAG-NEXT: s_nop 15
-; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v12
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v13
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a15, v27
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] blgp:4
-; GISEL-NEXT: s_nop 15
-; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
-; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
- i32 0, ; cbsz
- i32 4, ; blgp
- i32 0, i32 %scale0, i32 0, i32 %scale1)
- ret <16 x float> %result
-}
-
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15] blgp:4
-; SDAG-NEXT: s_nop 15
-; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v12
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v13
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a15, v27
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15] blgp:4
-; GISEL-NEXT: s_nop 15
-; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
-; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
- i32 0, ; cbsz
- i32 4, ; blgp
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: scratch_load_dword a15, off, s32
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: v_accvgpr_write_b32 a4, v20
+; GCN-NEXT: v_accvgpr_write_b32 a5, v21
+; GCN-NEXT: v_accvgpr_write_b32 a6, v22
+; GCN-NEXT: v_accvgpr_write_b32 a7, v23
+; GCN-NEXT: v_accvgpr_write_b32 a8, v24
+; GCN-NEXT: v_accvgpr_write_b32 a9, v25
+; GCN-NEXT: v_accvgpr_write_b32 a10, v26
+; GCN-NEXT: v_accvgpr_write_b32 a11, v27
+; GCN-NEXT: v_accvgpr_write_b32 a12, v28
+; GCN-NEXT: v_accvgpr_write_b32 a13, v29
+; GCN-NEXT: v_accvgpr_write_b32 a14, v30
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:1
+; GCN-NEXT: s_nop 15
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 1, ; cbsz
+ i32 0, ; blgp
i32 0, i32 0, i32 0, i32 0)
ret <16 x float> %result
}
-; bf8 x fp8
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0:
+; bf8 x bf8
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:8
-; SDAG-NEXT: scratch_load_dword v17, off, s32 offset:4
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[0,0,0] cbsz:1
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:1 blgp:1
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
@@ -1643,14 +1454,14 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0(<8 x
; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0:
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4
+; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8
; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4
-; GISEL-NEXT: scratch_load_dword v17, off, s32 offset:8
; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
@@ -1666,7 +1477,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0(<8 x
; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] cbsz:1
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:1 blgp:1
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
@@ -1688,3087 +1499,1992 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0(<8 x
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 1, ; cbsz
- i32 0, ; blgp
+ i32 1, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:1
-; SDAG-NEXT: s_nop 15
-; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:1
-; GISEL-NEXT: s_nop 15
-; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: scratch_load_dword a15, off, s32
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: v_accvgpr_write_b32 a4, v20
+; GCN-NEXT: v_accvgpr_write_b32 a5, v21
+; GCN-NEXT: v_accvgpr_write_b32 a6, v22
+; GCN-NEXT: v_accvgpr_write_b32 a7, v23
+; GCN-NEXT: v_accvgpr_write_b32 a8, v24
+; GCN-NEXT: v_accvgpr_write_b32 a9, v25
+; GCN-NEXT: v_accvgpr_write_b32 a10, v26
+; GCN-NEXT: v_accvgpr_write_b32 a11, v27
+; GCN-NEXT: v_accvgpr_write_b32 a12, v28
+; GCN-NEXT: v_accvgpr_write_b32 a13, v29
+; GCN-NEXT: v_accvgpr_write_b32 a14, v30
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:1 blgp:1
+; GCN-NEXT: s_nop 15
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 1, ; cbsz
- i32 0, ; blgp
+ i32 1, ; blgp
i32 0, i32 0, i32 0, i32 0)
ret <16 x float> %result
}
-; bf8 x bf8
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:8
-; SDAG-NEXT: scratch_load_dword v17, off, s32 offset:4
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[0,0,0] cbsz:1 blgp:1
-; SDAG-NEXT: s_nop 15
-; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4
-; GISEL-NEXT: scratch_load_dword v17, off, s32 offset:8
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] cbsz:1 blgp:1
-; GISEL-NEXT: s_nop 15
-; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
-; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+; bf8 x fp6
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: scratch_load_dword v31, off, s32
+; GCN-NEXT: v_accvgpr_write_b32 a0, v14
+; GCN-NEXT: v_accvgpr_write_b32 a1, v15
+; GCN-NEXT: v_accvgpr_write_b32 a2, v16
+; GCN-NEXT: v_accvgpr_write_b32 a3, v17
+; GCN-NEXT: v_accvgpr_write_b32 a4, v18
+; GCN-NEXT: v_accvgpr_write_b32 a5, v19
+; GCN-NEXT: v_accvgpr_write_b32 a6, v20
+; GCN-NEXT: v_accvgpr_write_b32 a7, v21
+; GCN-NEXT: v_accvgpr_write_b32 a8, v22
+; GCN-NEXT: v_accvgpr_write_b32 a9, v23
+; GCN-NEXT: v_accvgpr_write_b32 a10, v24
+; GCN-NEXT: v_accvgpr_write_b32 a11, v25
+; GCN-NEXT: v_accvgpr_write_b32 a12, v26
+; GCN-NEXT: v_accvgpr_write_b32 a13, v27
+; GCN-NEXT: v_accvgpr_write_b32 a14, v28
+; GCN-NEXT: v_accvgpr_write_b32 a15, v29
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:1 blgp:2
+; GCN-NEXT: s_nop 15
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
i32 1, ; cbsz
- i32 1, ; blgp
+ i32 2, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v14
+; GCN-NEXT: v_accvgpr_write_b32 a1, v15
+; GCN-NEXT: v_accvgpr_write_b32 a2, v16
+; GCN-NEXT: v_accvgpr_write_b32 a3, v17
+; GCN-NEXT: v_accvgpr_write_b32 a4, v18
+; GCN-NEXT: v_accvgpr_write_b32 a5, v19
+; GCN-NEXT: v_accvgpr_write_b32 a6, v20
+; GCN-NEXT: v_accvgpr_write_b32 a7, v21
+; GCN-NEXT: v_accvgpr_write_b32 a8, v22
+; GCN-NEXT: v_accvgpr_write_b32 a9, v23
+; GCN-NEXT: v_accvgpr_write_b32 a10, v24
+; GCN-NEXT: v_accvgpr_write_b32 a11, v25
+; GCN-NEXT: v_accvgpr_write_b32 a12, v26
+; GCN-NEXT: v_accvgpr_write_b32 a13, v27
+; GCN-NEXT: v_accvgpr_write_b32 a14, v28
+; GCN-NEXT: v_accvgpr_write_b32 a15, v29
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] cbsz:1 blgp:2
+; GCN-NEXT: s_nop 15
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
+ i32 1, ; cbsz
+ i32 2, ; blgp
+ i32 0, i32 0, i32 0, i32 0)
+ ret <16 x float> %result
+}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:1 blgp:1
-; SDAG-NEXT: s_nop 15
-; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:1 blgp:1
-; GISEL-NEXT: s_nop 15
-; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
-; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+; bf8 x bf6
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: scratch_load_dword v31, off, s32
+; GCN-NEXT: v_accvgpr_write_b32 a0, v14
+; GCN-NEXT: v_accvgpr_write_b32 a1, v15
+; GCN-NEXT: v_accvgpr_write_b32 a2, v16
+; GCN-NEXT: v_accvgpr_write_b32 a3, v17
+; GCN-NEXT: v_accvgpr_write_b32 a4, v18
+; GCN-NEXT: v_accvgpr_write_b32 a5, v19
+; GCN-NEXT: v_accvgpr_write_b32 a6, v20
+; GCN-NEXT: v_accvgpr_write_b32 a7, v21
+; GCN-NEXT: v_accvgpr_write_b32 a8, v22
+; GCN-NEXT: v_accvgpr_write_b32 a9, v23
+; GCN-NEXT: v_accvgpr_write_b32 a10, v24
+; GCN-NEXT: v_accvgpr_write_b32 a11, v25
+; GCN-NEXT: v_accvgpr_write_b32 a12, v26
+; GCN-NEXT: v_accvgpr_write_b32 a13, v27
+; GCN-NEXT: v_accvgpr_write_b32 a14, v28
+; GCN-NEXT: v_accvgpr_write_b32 a15, v29
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:1 blgp:3
+; GCN-NEXT: s_nop 15
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
i32 1, ; cbsz
- i32 1, ; blgp
- i32 0, i32 0, i32 0, i32 0)
+ i32 3, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
-; bf8 x fp6
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v14
-; SDAG-NEXT: scratch_load_dword v14, off, s32
-; SDAG-NEXT: v_accvgpr_write_b32 a15, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v15
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:1 blgp:2
-; SDAG-NEXT: s_nop 15
-; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v14
-; GISEL-NEXT: scratch_load_dword v14, off, s32
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a15, v29
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:1 blgp:2
-; GISEL-NEXT: s_nop 15
-; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v14
+; GCN-NEXT: v_accvgpr_write_b32 a1, v15
+; GCN-NEXT: v_accvgpr_write_b32 a2, v16
+; GCN-NEXT: v_accvgpr_write_b32 a3, v17
+; GCN-NEXT: v_accvgpr_write_b32 a4, v18
+; GCN-NEXT: v_accvgpr_write_b32 a5, v19
+; GCN-NEXT: v_accvgpr_write_b32 a6, v20
+; GCN-NEXT: v_accvgpr_write_b32 a7, v21
+; GCN-NEXT: v_accvgpr_write_b32 a8, v22
+; GCN-NEXT: v_accvgpr_write_b32 a9, v23
+; GCN-NEXT: v_accvgpr_write_b32 a10, v24
+; GCN-NEXT: v_accvgpr_write_b32 a11, v25
+; GCN-NEXT: v_accvgpr_write_b32 a12, v26
+; GCN-NEXT: v_accvgpr_write_b32 a13, v27
+; GCN-NEXT: v_accvgpr_write_b32 a14, v28
+; GCN-NEXT: v_accvgpr_write_b32 a15, v29
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] cbsz:1 blgp:3
+; GCN-NEXT: s_nop 15
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
i32 1, ; cbsz
- i32 2, ; blgp
+ i32 3, ; blgp
+ i32 0, i32 0, i32 0, i32 0)
+ ret <16 x float> %result
+}
+
+; bf8 x fp4
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v12
+; GCN-NEXT: v_accvgpr_write_b32 a1, v13
+; GCN-NEXT: v_accvgpr_write_b32 a2, v14
+; GCN-NEXT: v_accvgpr_write_b32 a3, v15
+; GCN-NEXT: v_accvgpr_write_b32 a4, v16
+; GCN-NEXT: v_accvgpr_write_b32 a5, v17
+; GCN-NEXT: v_accvgpr_write_b32 a6, v18
+; GCN-NEXT: v_accvgpr_write_b32 a7, v19
+; GCN-NEXT: v_accvgpr_write_b32 a8, v20
+; GCN-NEXT: v_accvgpr_write_b32 a9, v21
+; GCN-NEXT: v_accvgpr_write_b32 a10, v22
+; GCN-NEXT: v_accvgpr_write_b32 a11, v23
+; GCN-NEXT: v_accvgpr_write_b32 a12, v24
+; GCN-NEXT: v_accvgpr_write_b32 a13, v25
+; GCN-NEXT: v_accvgpr_write_b32 a14, v26
+; GCN-NEXT: v_accvgpr_write_b32 a15, v27
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:1 blgp:4
+; GCN-NEXT: s_nop 15
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
+ i32 1, ; cbsz
+ i32 4, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a15, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v14
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] cbsz:1 blgp:2
-; SDAG-NEXT: s_nop 15
-; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a15, v29
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] cbsz:1 blgp:2
-; GISEL-NEXT: s_nop 15
-; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
-; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v12
+; GCN-NEXT: v_accvgpr_write_b32 a1, v13
+; GCN-NEXT: v_accvgpr_write_b32 a2, v14
+; GCN-NEXT: v_accvgpr_write_b32 a3, v15
+; GCN-NEXT: v_accvgpr_write_b32 a4, v16
+; GCN-NEXT: v_accvgpr_write_b32 a5, v17
+; GCN-NEXT: v_accvgpr_write_b32 a6, v18
+; GCN-NEXT: v_accvgpr_write_b32 a7, v19
+; GCN-NEXT: v_accvgpr_write_b32 a8, v20
+; GCN-NEXT: v_accvgpr_write_b32 a9, v21
+; GCN-NEXT: v_accvgpr_write_b32 a10, v22
+; GCN-NEXT: v_accvgpr_write_b32 a11, v23
+; GCN-NEXT: v_accvgpr_write_b32 a12, v24
+; GCN-NEXT: v_accvgpr_write_b32 a13, v25
+; GCN-NEXT: v_accvgpr_write_b32 a14, v26
+; GCN-NEXT: v_accvgpr_write_b32 a15, v27
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15] cbsz:1 blgp:4
+; GCN-NEXT: s_nop 15
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
i32 1, ; cbsz
- i32 2, ; blgp
+ i32 4, ; blgp
i32 0, i32 0, i32 0, i32 0)
ret <16 x float> %result
}
-; bf8 x bf6
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v14
-; SDAG-NEXT: scratch_load_dword v14, off, s32
-; SDAG-NEXT: v_accvgpr_write_b32 a15, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v15
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:1 blgp:3
-; SDAG-NEXT: s_nop 15
-; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v14
-; GISEL-NEXT: scratch_load_dword v14, off, s32
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a15, v29
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:1 blgp:3
-; GISEL-NEXT: s_nop 15
-; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
-; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
- i32 1, ; cbsz
- i32 3, ; blgp
+; fp6 x fp8
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: scratch_load_dword v31, off, s32
+; GCN-NEXT: v_accvgpr_write_b32 a0, v14
+; GCN-NEXT: v_accvgpr_write_b32 a1, v15
+; GCN-NEXT: v_accvgpr_write_b32 a2, v16
+; GCN-NEXT: v_accvgpr_write_b32 a3, v17
+; GCN-NEXT: v_accvgpr_write_b32 a4, v18
+; GCN-NEXT: v_accvgpr_write_b32 a5, v19
+; GCN-NEXT: v_accvgpr_write_b32 a6, v20
+; GCN-NEXT: v_accvgpr_write_b32 a7, v21
+; GCN-NEXT: v_accvgpr_write_b32 a8, v22
+; GCN-NEXT: v_accvgpr_write_b32 a9, v23
+; GCN-NEXT: v_accvgpr_write_b32 a10, v24
+; GCN-NEXT: v_accvgpr_write_b32 a11, v25
+; GCN-NEXT: v_accvgpr_write_b32 a12, v26
+; GCN-NEXT: v_accvgpr_write_b32 a13, v27
+; GCN-NEXT: v_accvgpr_write_b32 a14, v28
+; GCN-NEXT: v_accvgpr_write_b32 a15, v29
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:2
+; GCN-NEXT: s_nop 15
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 2, ; cbsz
+ i32 0, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a15, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v14
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] cbsz:1 blgp:3
-; SDAG-NEXT: s_nop 15
-; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a15, v29
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] cbsz:1 blgp:3
-; GISEL-NEXT: s_nop 15
-; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
-; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
- i32 1, ; cbsz
- i32 3, ; blgp
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v14
+; GCN-NEXT: v_accvgpr_write_b32 a1, v15
+; GCN-NEXT: v_accvgpr_write_b32 a2, v16
+; GCN-NEXT: v_accvgpr_write_b32 a3, v17
+; GCN-NEXT: v_accvgpr_write_b32 a4, v18
+; GCN-NEXT: v_accvgpr_write_b32 a5, v19
+; GCN-NEXT: v_accvgpr_write_b32 a6, v20
+; GCN-NEXT: v_accvgpr_write_b32 a7, v21
+; GCN-NEXT: v_accvgpr_write_b32 a8, v22
+; GCN-NEXT: v_accvgpr_write_b32 a9, v23
+; GCN-NEXT: v_accvgpr_write_b32 a10, v24
+; GCN-NEXT: v_accvgpr_write_b32 a11, v25
+; GCN-NEXT: v_accvgpr_write_b32 a12, v26
+; GCN-NEXT: v_accvgpr_write_b32 a13, v27
+; GCN-NEXT: v_accvgpr_write_b32 a14, v28
+; GCN-NEXT: v_accvgpr_write_b32 a15, v29
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:2
+; GCN-NEXT: s_nop 15
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 2, ; cbsz
+ i32 0, ; blgp
i32 0, i32 0, i32 0, i32 0)
ret <16 x float> %result
}
-; bf8 x fp4
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:1 blgp:4
-; SDAG-NEXT: s_nop 15
-; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v12
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v13
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a15, v27
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:1 blgp:4
-; GISEL-NEXT: s_nop 15
-; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
-; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
- i32 1, ; cbsz
- i32 4, ; blgp
+; fp6 x bf8
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: scratch_load_dword v31, off, s32
+; GCN-NEXT: v_accvgpr_write_b32 a0, v14
+; GCN-NEXT: v_accvgpr_write_b32 a1, v15
+; GCN-NEXT: v_accvgpr_write_b32 a2, v16
+; GCN-NEXT: v_accvgpr_write_b32 a3, v17
+; GCN-NEXT: v_accvgpr_write_b32 a4, v18
+; GCN-NEXT: v_accvgpr_write_b32 a5, v19
+; GCN-NEXT: v_accvgpr_write_b32 a6, v20
+; GCN-NEXT: v_accvgpr_write_b32 a7, v21
+; GCN-NEXT: v_accvgpr_write_b32 a8, v22
+; GCN-NEXT: v_accvgpr_write_b32 a9, v23
+; GCN-NEXT: v_accvgpr_write_b32 a10, v24
+; GCN-NEXT: v_accvgpr_write_b32 a11, v25
+; GCN-NEXT: v_accvgpr_write_b32 a12, v26
+; GCN-NEXT: v_accvgpr_write_b32 a13, v27
+; GCN-NEXT: v_accvgpr_write_b32 a14, v28
+; GCN-NEXT: v_accvgpr_write_b32 a15, v29
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:2 blgp:1
+; GCN-NEXT: s_nop 15
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 2, ; cbsz
+ i32 1, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15] cbsz:1 blgp:4
-; SDAG-NEXT: s_nop 15
-; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v12
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v13
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a15, v27
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15] cbsz:1 blgp:4
-; GISEL-NEXT: s_nop 15
-; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
-; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
- i32 1, ; cbsz
- i32 4, ; blgp
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v14
+; GCN-NEXT: v_accvgpr_write_b32 a1, v15
+; GCN-NEXT: v_accvgpr_write_b32 a2, v16
+; GCN-NEXT: v_accvgpr_write_b32 a3, v17
+; GCN-NEXT: v_accvgpr_write_b32 a4, v18
+; GCN-NEXT: v_accvgpr_write_b32 a5, v19
+; GCN-NEXT: v_accvgpr_write_b32 a6, v20
+; GCN-NEXT: v_accvgpr_write_b32 a7, v21
+; GCN-NEXT: v_accvgpr_write_b32 a8, v22
+; GCN-NEXT: v_accvgpr_write_b32 a9, v23
+; GCN-NEXT: v_accvgpr_write_b32 a10, v24
+; GCN-NEXT: v_accvgpr_write_b32 a11, v25
+; GCN-NEXT: v_accvgpr_write_b32 a12, v26
+; GCN-NEXT: v_accvgpr_write_b32 a13, v27
+; GCN-NEXT: v_accvgpr_write_b32 a14, v28
+; GCN-NEXT: v_accvgpr_write_b32 a15, v29
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:2 blgp:1
+; GCN-NEXT: s_nop 15
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 2, ; cbsz
+ i32 1, ; blgp
i32 0, i32 0, i32 0, i32 0)
ret <16 x float> %result
}
-; fp6 x fp8
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v14
-; SDAG-NEXT: scratch_load_dword v14, off, s32
-; SDAG-NEXT: v_accvgpr_write_b32 a15, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v15
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:2
-; SDAG-NEXT: s_nop 15
-; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v14
-; GISEL-NEXT: scratch_load_dword v14, off, s32
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a15, v29
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:2
-; GISEL-NEXT: s_nop 15
-; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
-; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+; fp6 x fp6
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v12
+; GCN-NEXT: v_accvgpr_write_b32 a1, v13
+; GCN-NEXT: v_accvgpr_write_b32 a2, v14
+; GCN-NEXT: v_accvgpr_write_b32 a3, v15
+; GCN-NEXT: v_accvgpr_write_b32 a4, v16
+; GCN-NEXT: v_accvgpr_write_b32 a5, v17
+; GCN-NEXT: v_accvgpr_write_b32 a6, v18
+; GCN-NEXT: v_accvgpr_write_b32 a7, v19
+; GCN-NEXT: v_accvgpr_write_b32 a8, v20
+; GCN-NEXT: v_accvgpr_write_b32 a9, v21
+; GCN-NEXT: v_accvgpr_write_b32 a10, v22
+; GCN-NEXT: v_accvgpr_write_b32 a11, v23
+; GCN-NEXT: v_accvgpr_write_b32 a12, v24
+; GCN-NEXT: v_accvgpr_write_b32 a13, v25
+; GCN-NEXT: v_accvgpr_write_b32 a14, v26
+; GCN-NEXT: v_accvgpr_write_b32 a15, v27
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:2 blgp:2
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
i32 2, ; cbsz
- i32 0, ; blgp
+ i32 2, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a15, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v14
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:2
-; SDAG-NEXT: s_nop 15
-; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a15, v29
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:2
-; GISEL-NEXT: s_nop 15
-; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
-; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v12
+; GCN-NEXT: v_accvgpr_write_b32 a1, v13
+; GCN-NEXT: v_accvgpr_write_b32 a2, v14
+; GCN-NEXT: v_accvgpr_write_b32 a3, v15
+; GCN-NEXT: v_accvgpr_write_b32 a4, v16
+; GCN-NEXT: v_accvgpr_write_b32 a5, v17
+; GCN-NEXT: v_accvgpr_write_b32 a6, v18
+; GCN-NEXT: v_accvgpr_write_b32 a7, v19
+; GCN-NEXT: v_accvgpr_write_b32 a8, v20
+; GCN-NEXT: v_accvgpr_write_b32 a9, v21
+; GCN-NEXT: v_accvgpr_write_b32 a10, v22
+; GCN-NEXT: v_accvgpr_write_b32 a11, v23
+; GCN-NEXT: v_accvgpr_write_b32 a12, v24
+; GCN-NEXT: v_accvgpr_write_b32 a13, v25
+; GCN-NEXT: v_accvgpr_write_b32 a14, v26
+; GCN-NEXT: v_accvgpr_write_b32 a15, v27
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:2 blgp:2
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
i32 2, ; cbsz
- i32 0, ; blgp
+ i32 2, ; blgp
i32 0, i32 0, i32 0, i32 0)
ret <16 x float> %result
}
-; fp6 x bf8
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v14
-; SDAG-NEXT: scratch_load_dword v14, off, s32
-; SDAG-NEXT: v_accvgpr_write_b32 a15, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v15
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:2 blgp:1
-; SDAG-NEXT: s_nop 15
-; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v14
-; GISEL-NEXT: scratch_load_dword v14, off, s32
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a15, v29
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:2 blgp:1
-; GISEL-NEXT: s_nop 15
-; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
-; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+; fp6 x bf6
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v12
+; GCN-NEXT: v_accvgpr_write_b32 a1, v13
+; GCN-NEXT: v_accvgpr_write_b32 a2, v14
+; GCN-NEXT: v_accvgpr_write_b32 a3, v15
+; GCN-NEXT: v_accvgpr_write_b32 a4, v16
+; GCN-NEXT: v_accvgpr_write_b32 a5, v17
+; GCN-NEXT: v_accvgpr_write_b32 a6, v18
+; GCN-NEXT: v_accvgpr_write_b32 a7, v19
+; GCN-NEXT: v_accvgpr_write_b32 a8, v20
+; GCN-NEXT: v_accvgpr_write_b32 a9, v21
+; GCN-NEXT: v_accvgpr_write_b32 a10, v22
+; GCN-NEXT: v_accvgpr_write_b32 a11, v23
+; GCN-NEXT: v_accvgpr_write_b32 a12, v24
+; GCN-NEXT: v_accvgpr_write_b32 a13, v25
+; GCN-NEXT: v_accvgpr_write_b32 a14, v26
+; GCN-NEXT: v_accvgpr_write_b32 a15, v27
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:2 blgp:3
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
i32 2, ; cbsz
- i32 1, ; blgp
+ i32 3, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a15, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v14
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:2 blgp:1
-; SDAG-NEXT: s_nop 15
-; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a15, v29
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:2 blgp:1
-; GISEL-NEXT: s_nop 15
-; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
-; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v12
+; GCN-NEXT: v_accvgpr_write_b32 a1, v13
+; GCN-NEXT: v_accvgpr_write_b32 a2, v14
+; GCN-NEXT: v_accvgpr_write_b32 a3, v15
+; GCN-NEXT: v_accvgpr_write_b32 a4, v16
+; GCN-NEXT: v_accvgpr_write_b32 a5, v17
+; GCN-NEXT: v_accvgpr_write_b32 a6, v18
+; GCN-NEXT: v_accvgpr_write_b32 a7, v19
+; GCN-NEXT: v_accvgpr_write_b32 a8, v20
+; GCN-NEXT: v_accvgpr_write_b32 a9, v21
+; GCN-NEXT: v_accvgpr_write_b32 a10, v22
+; GCN-NEXT: v_accvgpr_write_b32 a11, v23
+; GCN-NEXT: v_accvgpr_write_b32 a12, v24
+; GCN-NEXT: v_accvgpr_write_b32 a13, v25
+; GCN-NEXT: v_accvgpr_write_b32 a14, v26
+; GCN-NEXT: v_accvgpr_write_b32 a15, v27
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:2 blgp:3
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
i32 2, ; cbsz
- i32 1, ; blgp
+ i32 3, ; blgp
i32 0, i32 0, i32 0, i32 0)
ret <16 x float> %result
}
-; fp6 x fp6
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:2 blgp:2
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v12
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v13
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a15, v27
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:2 blgp:2
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
-; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
- i32 2, ; cbsz
- i32 2, ; blgp
+
+; bf6 x fp8
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: scratch_load_dword v31, off, s32
+; GCN-NEXT: v_accvgpr_write_b32 a0, v14
+; GCN-NEXT: v_accvgpr_write_b32 a1, v15
+; GCN-NEXT: v_accvgpr_write_b32 a2, v16
+; GCN-NEXT: v_accvgpr_write_b32 a3, v17
+; GCN-NEXT: v_accvgpr_write_b32 a4, v18
+; GCN-NEXT: v_accvgpr_write_b32 a5, v19
+; GCN-NEXT: v_accvgpr_write_b32 a6, v20
+; GCN-NEXT: v_accvgpr_write_b32 a7, v21
+; GCN-NEXT: v_accvgpr_write_b32 a8, v22
+; GCN-NEXT: v_accvgpr_write_b32 a9, v23
+; GCN-NEXT: v_accvgpr_write_b32 a10, v24
+; GCN-NEXT: v_accvgpr_write_b32 a11, v25
+; GCN-NEXT: v_accvgpr_write_b32 a12, v26
+; GCN-NEXT: v_accvgpr_write_b32 a13, v27
+; GCN-NEXT: v_accvgpr_write_b32 a14, v28
+; GCN-NEXT: v_accvgpr_write_b32 a15, v29
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:3
+; GCN-NEXT: s_nop 15
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 3, ; cbsz
+ i32 0, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:2 blgp:2
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v12
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v13
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a15, v27
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:2 blgp:2
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
-; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
- i32 2, ; cbsz
- i32 2, ; blgp
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v14
+; GCN-NEXT: v_accvgpr_write_b32 a1, v15
+; GCN-NEXT: v_accvgpr_write_b32 a2, v16
+; GCN-NEXT: v_accvgpr_write_b32 a3, v17
+; GCN-NEXT: v_accvgpr_write_b32 a4, v18
+; GCN-NEXT: v_accvgpr_write_b32 a5, v19
+; GCN-NEXT: v_accvgpr_write_b32 a6, v20
+; GCN-NEXT: v_accvgpr_write_b32 a7, v21
+; GCN-NEXT: v_accvgpr_write_b32 a8, v22
+; GCN-NEXT: v_accvgpr_write_b32 a9, v23
+; GCN-NEXT: v_accvgpr_write_b32 a10, v24
+; GCN-NEXT: v_accvgpr_write_b32 a11, v25
+; GCN-NEXT: v_accvgpr_write_b32 a12, v26
+; GCN-NEXT: v_accvgpr_write_b32 a13, v27
+; GCN-NEXT: v_accvgpr_write_b32 a14, v28
+; GCN-NEXT: v_accvgpr_write_b32 a15, v29
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:3
+; GCN-NEXT: s_nop 15
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 3, ; cbsz
+ i32 0, ; blgp
i32 0, i32 0, i32 0, i32 0)
ret <16 x float> %result
}
-; fp6 x bf6
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:2 blgp:3
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v12
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v13
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a15, v27
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:2 blgp:3
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
-; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
- i32 2, ; cbsz
- i32 3, ; blgp
+; bf6 x bf8
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: scratch_load_dword v31, off, s32
+; GCN-NEXT: v_accvgpr_write_b32 a0, v14
+; GCN-NEXT: v_accvgpr_write_b32 a1, v15
+; GCN-NEXT: v_accvgpr_write_b32 a2, v16
+; GCN-NEXT: v_accvgpr_write_b32 a3, v17
+; GCN-NEXT: v_accvgpr_write_b32 a4, v18
+; GCN-NEXT: v_accvgpr_write_b32 a5, v19
+; GCN-NEXT: v_accvgpr_write_b32 a6, v20
+; GCN-NEXT: v_accvgpr_write_b32 a7, v21
+; GCN-NEXT: v_accvgpr_write_b32 a8, v22
+; GCN-NEXT: v_accvgpr_write_b32 a9, v23
+; GCN-NEXT: v_accvgpr_write_b32 a10, v24
+; GCN-NEXT: v_accvgpr_write_b32 a11, v25
+; GCN-NEXT: v_accvgpr_write_b32 a12, v26
+; GCN-NEXT: v_accvgpr_write_b32 a13, v27
+; GCN-NEXT: v_accvgpr_write_b32 a14, v28
+; GCN-NEXT: v_accvgpr_write_b32 a15, v29
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:3 blgp:1
+; GCN-NEXT: s_nop 15
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 3, ; cbsz
+ i32 1, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:2 blgp:3
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v12
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v13
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a15, v27
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:2 blgp:3
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
-; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
- i32 2, ; cbsz
- i32 3, ; blgp
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v14
+; GCN-NEXT: v_accvgpr_write_b32 a1, v15
+; GCN-NEXT: v_accvgpr_write_b32 a2, v16
+; GCN-NEXT: v_accvgpr_write_b32 a3, v17
+; GCN-NEXT: v_accvgpr_write_b32 a4, v18
+; GCN-NEXT: v_accvgpr_write_b32 a5, v19
+; GCN-NEXT: v_accvgpr_write_b32 a6, v20
+; GCN-NEXT: v_accvgpr_write_b32 a7, v21
+; GCN-NEXT: v_accvgpr_write_b32 a8, v22
+; GCN-NEXT: v_accvgpr_write_b32 a9, v23
+; GCN-NEXT: v_accvgpr_write_b32 a10, v24
+; GCN-NEXT: v_accvgpr_write_b32 a11, v25
+; GCN-NEXT: v_accvgpr_write_b32 a12, v26
+; GCN-NEXT: v_accvgpr_write_b32 a13, v27
+; GCN-NEXT: v_accvgpr_write_b32 a14, v28
+; GCN-NEXT: v_accvgpr_write_b32 a15, v29
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:3 blgp:1
+; GCN-NEXT: s_nop 15
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 3, ; cbsz
+ i32 1, ; blgp
i32 0, i32 0, i32 0, i32 0)
ret <16 x float> %result
}
-
-; bf6 x fp8
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v14
-; SDAG-NEXT: scratch_load_dword v14, off, s32
-; SDAG-NEXT: v_accvgpr_write_b32 a15, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v15
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:3
-; SDAG-NEXT: s_nop 15
-; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v14
-; GISEL-NEXT: scratch_load_dword v14, off, s32
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a15, v29
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:3
-; GISEL-NEXT: s_nop 15
-; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
-; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+; bf6 x fp6
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v12
+; GCN-NEXT: v_accvgpr_write_b32 a1, v13
+; GCN-NEXT: v_accvgpr_write_b32 a2, v14
+; GCN-NEXT: v_accvgpr_write_b32 a3, v15
+; GCN-NEXT: v_accvgpr_write_b32 a4, v16
+; GCN-NEXT: v_accvgpr_write_b32 a5, v17
+; GCN-NEXT: v_accvgpr_write_b32 a6, v18
+; GCN-NEXT: v_accvgpr_write_b32 a7, v19
+; GCN-NEXT: v_accvgpr_write_b32 a8, v20
+; GCN-NEXT: v_accvgpr_write_b32 a9, v21
+; GCN-NEXT: v_accvgpr_write_b32 a10, v22
+; GCN-NEXT: v_accvgpr_write_b32 a11, v23
+; GCN-NEXT: v_accvgpr_write_b32 a12, v24
+; GCN-NEXT: v_accvgpr_write_b32 a13, v25
+; GCN-NEXT: v_accvgpr_write_b32 a14, v26
+; GCN-NEXT: v_accvgpr_write_b32 a15, v27
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:3 blgp:2
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
i32 3, ; cbsz
- i32 0, ; blgp
+ i32 2, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a15, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v14
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:3
-; SDAG-NEXT: s_nop 15
-; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a15, v29
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:3
-; GISEL-NEXT: s_nop 15
-; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
-; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v12
+; GCN-NEXT: v_accvgpr_write_b32 a1, v13
+; GCN-NEXT: v_accvgpr_write_b32 a2, v14
+; GCN-NEXT: v_accvgpr_write_b32 a3, v15
+; GCN-NEXT: v_accvgpr_write_b32 a4, v16
+; GCN-NEXT: v_accvgpr_write_b32 a5, v17
+; GCN-NEXT: v_accvgpr_write_b32 a6, v18
+; GCN-NEXT: v_accvgpr_write_b32 a7, v19
+; GCN-NEXT: v_accvgpr_write_b32 a8, v20
+; GCN-NEXT: v_accvgpr_write_b32 a9, v21
+; GCN-NEXT: v_accvgpr_write_b32 a10, v22
+; GCN-NEXT: v_accvgpr_write_b32 a11, v23
+; GCN-NEXT: v_accvgpr_write_b32 a12, v24
+; GCN-NEXT: v_accvgpr_write_b32 a13, v25
+; GCN-NEXT: v_accvgpr_write_b32 a14, v26
+; GCN-NEXT: v_accvgpr_write_b32 a15, v27
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:3 blgp:2
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
i32 3, ; cbsz
- i32 0, ; blgp
+ i32 2, ; blgp
i32 0, i32 0, i32 0, i32 0)
ret <16 x float> %result
}
-; bf6 x bf8
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v14
-; SDAG-NEXT: scratch_load_dword v14, off, s32
-; SDAG-NEXT: v_accvgpr_write_b32 a15, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v15
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:3 blgp:1
-; SDAG-NEXT: s_nop 15
-; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v14
-; GISEL-NEXT: scratch_load_dword v14, off, s32
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a15, v29
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:3 blgp:1
-; GISEL-NEXT: s_nop 15
-; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
-; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+; bf6 x fp4
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v10
+; GCN-NEXT: v_accvgpr_write_b32 a1, v11
+; GCN-NEXT: v_accvgpr_write_b32 a2, v12
+; GCN-NEXT: v_accvgpr_write_b32 a3, v13
+; GCN-NEXT: v_accvgpr_write_b32 a4, v14
+; GCN-NEXT: v_accvgpr_write_b32 a5, v15
+; GCN-NEXT: v_accvgpr_write_b32 a6, v16
+; GCN-NEXT: v_accvgpr_write_b32 a7, v17
+; GCN-NEXT: v_accvgpr_write_b32 a8, v18
+; GCN-NEXT: v_accvgpr_write_b32 a9, v19
+; GCN-NEXT: v_accvgpr_write_b32 a10, v20
+; GCN-NEXT: v_accvgpr_write_b32 a11, v21
+; GCN-NEXT: v_accvgpr_write_b32 a12, v22
+; GCN-NEXT: v_accvgpr_write_b32 a13, v23
+; GCN-NEXT: v_accvgpr_write_b32 a14, v24
+; GCN-NEXT: v_accvgpr_write_b32 a15, v25
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:3 blgp:4
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
i32 3, ; cbsz
- i32 1, ; blgp
+ i32 4, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a15, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v14
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:3 blgp:1
-; SDAG-NEXT: s_nop 15
-; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a15, v29
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:3 blgp:1
-; GISEL-NEXT: s_nop 15
-; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
-; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v10
+; GCN-NEXT: v_accvgpr_write_b32 a1, v11
+; GCN-NEXT: v_accvgpr_write_b32 a2, v12
+; GCN-NEXT: v_accvgpr_write_b32 a3, v13
+; GCN-NEXT: v_accvgpr_write_b32 a4, v14
+; GCN-NEXT: v_accvgpr_write_b32 a5, v15
+; GCN-NEXT: v_accvgpr_write_b32 a6, v16
+; GCN-NEXT: v_accvgpr_write_b32 a7, v17
+; GCN-NEXT: v_accvgpr_write_b32 a8, v18
+; GCN-NEXT: v_accvgpr_write_b32 a9, v19
+; GCN-NEXT: v_accvgpr_write_b32 a10, v20
+; GCN-NEXT: v_accvgpr_write_b32 a11, v21
+; GCN-NEXT: v_accvgpr_write_b32 a12, v22
+; GCN-NEXT: v_accvgpr_write_b32 a13, v23
+; GCN-NEXT: v_accvgpr_write_b32 a14, v24
+; GCN-NEXT: v_accvgpr_write_b32 a15, v25
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15] cbsz:3 blgp:4
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
i32 3, ; cbsz
- i32 1, ; blgp
+ i32 4, ; blgp
i32 0, i32 0, i32 0, i32 0)
ret <16 x float> %result
}
-; bf6 x fp6
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:3 blgp:2
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v12
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v13
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a15, v27
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:3 blgp:2
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; bf6 x bf6
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v12
+; GCN-NEXT: v_accvgpr_write_b32 a1, v13
+; GCN-NEXT: v_accvgpr_write_b32 a2, v14
+; GCN-NEXT: v_accvgpr_write_b32 a3, v15
+; GCN-NEXT: v_accvgpr_write_b32 a4, v16
+; GCN-NEXT: v_accvgpr_write_b32 a5, v17
+; GCN-NEXT: v_accvgpr_write_b32 a6, v18
+; GCN-NEXT: v_accvgpr_write_b32 a7, v19
+; GCN-NEXT: v_accvgpr_write_b32 a8, v20
+; GCN-NEXT: v_accvgpr_write_b32 a9, v21
+; GCN-NEXT: v_accvgpr_write_b32 a10, v22
+; GCN-NEXT: v_accvgpr_write_b32 a11, v23
+; GCN-NEXT: v_accvgpr_write_b32 a12, v24
+; GCN-NEXT: v_accvgpr_write_b32 a13, v25
+; GCN-NEXT: v_accvgpr_write_b32 a14, v26
+; GCN-NEXT: v_accvgpr_write_b32 a15, v27
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:3 blgp:3
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
i32 3, ; cbsz
- i32 2, ; blgp
+ i32 3, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:3 blgp:2
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v12
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v13
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a15, v27
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:3 blgp:2
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
-; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v12
+; GCN-NEXT: v_accvgpr_write_b32 a1, v13
+; GCN-NEXT: v_accvgpr_write_b32 a2, v14
+; GCN-NEXT: v_accvgpr_write_b32 a3, v15
+; GCN-NEXT: v_accvgpr_write_b32 a4, v16
+; GCN-NEXT: v_accvgpr_write_b32 a5, v17
+; GCN-NEXT: v_accvgpr_write_b32 a6, v18
+; GCN-NEXT: v_accvgpr_write_b32 a7, v19
+; GCN-NEXT: v_accvgpr_write_b32 a8, v20
+; GCN-NEXT: v_accvgpr_write_b32 a9, v21
+; GCN-NEXT: v_accvgpr_write_b32 a10, v22
+; GCN-NEXT: v_accvgpr_write_b32 a11, v23
+; GCN-NEXT: v_accvgpr_write_b32 a12, v24
+; GCN-NEXT: v_accvgpr_write_b32 a13, v25
+; GCN-NEXT: v_accvgpr_write_b32 a14, v26
+; GCN-NEXT: v_accvgpr_write_b32 a15, v27
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:3 blgp:3
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
i32 3, ; cbsz
- i32 2, ; blgp
+ i32 3, ; blgp
i32 0, i32 0, i32 0, i32 0)
ret <16 x float> %result
}
-; bf6 x fp4
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a15, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v14
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v13
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v12
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v11
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v10
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:3 blgp:4
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v10
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v11
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v12
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v13
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a15, v25
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:3 blgp:4
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; fp6 x fp4
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v10
+; GCN-NEXT: v_accvgpr_write_b32 a1, v11
+; GCN-NEXT: v_accvgpr_write_b32 a2, v12
+; GCN-NEXT: v_accvgpr_write_b32 a3, v13
+; GCN-NEXT: v_accvgpr_write_b32 a4, v14
+; GCN-NEXT: v_accvgpr_write_b32 a5, v15
+; GCN-NEXT: v_accvgpr_write_b32 a6, v16
+; GCN-NEXT: v_accvgpr_write_b32 a7, v17
+; GCN-NEXT: v_accvgpr_write_b32 a8, v18
+; GCN-NEXT: v_accvgpr_write_b32 a9, v19
+; GCN-NEXT: v_accvgpr_write_b32 a10, v20
+; GCN-NEXT: v_accvgpr_write_b32 a11, v21
+; GCN-NEXT: v_accvgpr_write_b32 a12, v22
+; GCN-NEXT: v_accvgpr_write_b32 a13, v23
+; GCN-NEXT: v_accvgpr_write_b32 a14, v24
+; GCN-NEXT: v_accvgpr_write_b32 a15, v25
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:2 blgp:4
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
- i32 3, ; cbsz
+ i32 2, ; cbsz
i32 4, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a15, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v14
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v13
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v12
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v11
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v10
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15] cbsz:3 blgp:4
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v10
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v11
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v12
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v13
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a15, v25
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15] cbsz:3 blgp:4
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v10
+; GCN-NEXT: v_accvgpr_write_b32 a1, v11
+; GCN-NEXT: v_accvgpr_write_b32 a2, v12
+; GCN-NEXT: v_accvgpr_write_b32 a3, v13
+; GCN-NEXT: v_accvgpr_write_b32 a4, v14
+; GCN-NEXT: v_accvgpr_write_b32 a5, v15
+; GCN-NEXT: v_accvgpr_write_b32 a6, v16
+; GCN-NEXT: v_accvgpr_write_b32 a7, v17
+; GCN-NEXT: v_accvgpr_write_b32 a8, v18
+; GCN-NEXT: v_accvgpr_write_b32 a9, v19
+; GCN-NEXT: v_accvgpr_write_b32 a10, v20
+; GCN-NEXT: v_accvgpr_write_b32 a11, v21
+; GCN-NEXT: v_accvgpr_write_b32 a12, v22
+; GCN-NEXT: v_accvgpr_write_b32 a13, v23
+; GCN-NEXT: v_accvgpr_write_b32 a14, v24
+; GCN-NEXT: v_accvgpr_write_b32 a15, v25
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15] cbsz:2 blgp:4
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
- i32 3, ; cbsz
+ i32 2, ; cbsz
i32 4, ; blgp
i32 0, i32 0, i32 0, i32 0)
ret <16 x float> %result
}
-; bf6 x bf6
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:3 blgp:3
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v12
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v13
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a15, v27
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:3 blgp:3
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
-; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
- i32 3, ; cbsz
- i32 3, ; blgp
+; fp4 x fp8
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v12
+; GCN-NEXT: v_accvgpr_write_b32 a1, v13
+; GCN-NEXT: v_accvgpr_write_b32 a2, v14
+; GCN-NEXT: v_accvgpr_write_b32 a3, v15
+; GCN-NEXT: v_accvgpr_write_b32 a4, v16
+; GCN-NEXT: v_accvgpr_write_b32 a5, v17
+; GCN-NEXT: v_accvgpr_write_b32 a6, v18
+; GCN-NEXT: v_accvgpr_write_b32 a7, v19
+; GCN-NEXT: v_accvgpr_write_b32 a8, v20
+; GCN-NEXT: v_accvgpr_write_b32 a9, v21
+; GCN-NEXT: v_accvgpr_write_b32 a10, v22
+; GCN-NEXT: v_accvgpr_write_b32 a11, v23
+; GCN-NEXT: v_accvgpr_write_b32 a12, v24
+; GCN-NEXT: v_accvgpr_write_b32 a13, v25
+; GCN-NEXT: v_accvgpr_write_b32 a14, v26
+; GCN-NEXT: v_accvgpr_write_b32 a15, v27
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:4
+; GCN-NEXT: s_nop 15
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 4, ; cbsz
+ i32 0, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:3 blgp:3
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v12
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v13
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a15, v27
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:3 blgp:3
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
-; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
- i32 3, ; cbsz
- i32 3, ; blgp
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v12
+; GCN-NEXT: v_accvgpr_write_b32 a1, v13
+; GCN-NEXT: v_accvgpr_write_b32 a2, v14
+; GCN-NEXT: v_accvgpr_write_b32 a3, v15
+; GCN-NEXT: v_accvgpr_write_b32 a4, v16
+; GCN-NEXT: v_accvgpr_write_b32 a5, v17
+; GCN-NEXT: v_accvgpr_write_b32 a6, v18
+; GCN-NEXT: v_accvgpr_write_b32 a7, v19
+; GCN-NEXT: v_accvgpr_write_b32 a8, v20
+; GCN-NEXT: v_accvgpr_write_b32 a9, v21
+; GCN-NEXT: v_accvgpr_write_b32 a10, v22
+; GCN-NEXT: v_accvgpr_write_b32 a11, v23
+; GCN-NEXT: v_accvgpr_write_b32 a12, v24
+; GCN-NEXT: v_accvgpr_write_b32 a13, v25
+; GCN-NEXT: v_accvgpr_write_b32 a14, v26
+; GCN-NEXT: v_accvgpr_write_b32 a15, v27
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15] cbsz:4
+; GCN-NEXT: s_nop 15
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 4, ; cbsz
+ i32 0, ; blgp
i32 0, i32 0, i32 0, i32 0)
ret <16 x float> %result
}
-; fp6 x fp4
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a15, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v14
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v13
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v12
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v11
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v10
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:2 blgp:4
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v10
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v11
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v12
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v13
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a15, v25
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:2 blgp:4
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
-; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
- i32 2, ; cbsz
- i32 4, ; blgp
+; fp4 x bf8
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v12
+; GCN-NEXT: v_accvgpr_write_b32 a1, v13
+; GCN-NEXT: v_accvgpr_write_b32 a2, v14
+; GCN-NEXT: v_accvgpr_write_b32 a3, v15
+; GCN-NEXT: v_accvgpr_write_b32 a4, v16
+; GCN-NEXT: v_accvgpr_write_b32 a5, v17
+; GCN-NEXT: v_accvgpr_write_b32 a6, v18
+; GCN-NEXT: v_accvgpr_write_b32 a7, v19
+; GCN-NEXT: v_accvgpr_write_b32 a8, v20
+; GCN-NEXT: v_accvgpr_write_b32 a9, v21
+; GCN-NEXT: v_accvgpr_write_b32 a10, v22
+; GCN-NEXT: v_accvgpr_write_b32 a11, v23
+; GCN-NEXT: v_accvgpr_write_b32 a12, v24
+; GCN-NEXT: v_accvgpr_write_b32 a13, v25
+; GCN-NEXT: v_accvgpr_write_b32 a14, v26
+; GCN-NEXT: v_accvgpr_write_b32 a15, v27
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:4 blgp:1
+; GCN-NEXT: s_nop 15
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 4, ; cbsz
+ i32 1, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a15, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v14
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v13
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v12
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v11
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v10
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15] cbsz:2 blgp:4
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v10
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v11
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v12
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v13
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a15, v25
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15] cbsz:2 blgp:4
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
-; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
- i32 2, ; cbsz
- i32 4, ; blgp
- i32 0, i32 0, i32 0, i32 0)
- ret <16 x float> %result
-}
-
-; fp4 x fp8
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:4
-; SDAG-NEXT: s_nop 15
-; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v12
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v13
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a15, v27
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:4
-; GISEL-NEXT: s_nop 15
-; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v12
+; GCN-NEXT: v_accvgpr_write_b32 a1, v13
+; GCN-NEXT: v_accvgpr_write_b32 a2, v14
+; GCN-NEXT: v_accvgpr_write_b32 a3, v15
+; GCN-NEXT: v_accvgpr_write_b32 a4, v16
+; GCN-NEXT: v_accvgpr_write_b32 a5, v17
+; GCN-NEXT: v_accvgpr_write_b32 a6, v18
+; GCN-NEXT: v_accvgpr_write_b32 a7, v19
+; GCN-NEXT: v_accvgpr_write_b32 a8, v20
+; GCN-NEXT: v_accvgpr_write_b32 a9, v21
+; GCN-NEXT: v_accvgpr_write_b32 a10, v22
+; GCN-NEXT: v_accvgpr_write_b32 a11, v23
+; GCN-NEXT: v_accvgpr_write_b32 a12, v24
+; GCN-NEXT: v_accvgpr_write_b32 a13, v25
+; GCN-NEXT: v_accvgpr_write_b32 a14, v26
+; GCN-NEXT: v_accvgpr_write_b32 a15, v27
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15] cbsz:4 blgp:1
+; GCN-NEXT: s_nop 15
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 4, ; cbsz
- i32 0, ; blgp
- i32 0, i32 %scale0, i32 0, i32 %scale1)
+ i32 1, ; blgp
+ i32 0, i32 0, i32 0, i32 0)
ret <16 x float> %result
}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15] cbsz:4
-; SDAG-NEXT: s_nop 15
-; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v12
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v13
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a15, v27
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15] cbsz:4
-; GISEL-NEXT: s_nop 15
-; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
-; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+; fp4 x fp6
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v10
+; GCN-NEXT: v_accvgpr_write_b32 a1, v11
+; GCN-NEXT: v_accvgpr_write_b32 a2, v12
+; GCN-NEXT: v_accvgpr_write_b32 a3, v13
+; GCN-NEXT: v_accvgpr_write_b32 a4, v14
+; GCN-NEXT: v_accvgpr_write_b32 a5, v15
+; GCN-NEXT: v_accvgpr_write_b32 a6, v16
+; GCN-NEXT: v_accvgpr_write_b32 a7, v17
+; GCN-NEXT: v_accvgpr_write_b32 a8, v18
+; GCN-NEXT: v_accvgpr_write_b32 a9, v19
+; GCN-NEXT: v_accvgpr_write_b32 a10, v20
+; GCN-NEXT: v_accvgpr_write_b32 a11, v21
+; GCN-NEXT: v_accvgpr_write_b32 a12, v22
+; GCN-NEXT: v_accvgpr_write_b32 a13, v23
+; GCN-NEXT: v_accvgpr_write_b32 a14, v24
+; GCN-NEXT: v_accvgpr_write_b32 a15, v25
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:4 blgp:2
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
i32 4, ; cbsz
- i32 0, ; blgp
+ i32 2, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <16 x float> %result
+}
+
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v10
+; GCN-NEXT: v_accvgpr_write_b32 a1, v11
+; GCN-NEXT: v_accvgpr_write_b32 a2, v12
+; GCN-NEXT: v_accvgpr_write_b32 a3, v13
+; GCN-NEXT: v_accvgpr_write_b32 a4, v14
+; GCN-NEXT: v_accvgpr_write_b32 a5, v15
+; GCN-NEXT: v_accvgpr_write_b32 a6, v16
+; GCN-NEXT: v_accvgpr_write_b32 a7, v17
+; GCN-NEXT: v_accvgpr_write_b32 a8, v18
+; GCN-NEXT: v_accvgpr_write_b32 a9, v19
+; GCN-NEXT: v_accvgpr_write_b32 a10, v20
+; GCN-NEXT: v_accvgpr_write_b32 a11, v21
+; GCN-NEXT: v_accvgpr_write_b32 a12, v22
+; GCN-NEXT: v_accvgpr_write_b32 a13, v23
+; GCN-NEXT: v_accvgpr_write_b32 a14, v24
+; GCN-NEXT: v_accvgpr_write_b32 a15, v25
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15] cbsz:4 blgp:2
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
+ i32 4, ; cbsz
+ i32 2, ; blgp
i32 0, i32 0, i32 0, i32 0)
ret <16 x float> %result
}
-; fp4 x bf8
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:4 blgp:1
-; SDAG-NEXT: s_nop 15
-; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v12
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v13
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a15, v27
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:4 blgp:1
-; GISEL-NEXT: s_nop 15
-; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
-; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+; fp4 x bf6
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v10
+; GCN-NEXT: v_accvgpr_write_b32 a1, v11
+; GCN-NEXT: v_accvgpr_write_b32 a2, v12
+; GCN-NEXT: v_accvgpr_write_b32 a3, v13
+; GCN-NEXT: v_accvgpr_write_b32 a4, v14
+; GCN-NEXT: v_accvgpr_write_b32 a5, v15
+; GCN-NEXT: v_accvgpr_write_b32 a6, v16
+; GCN-NEXT: v_accvgpr_write_b32 a7, v17
+; GCN-NEXT: v_accvgpr_write_b32 a8, v18
+; GCN-NEXT: v_accvgpr_write_b32 a9, v19
+; GCN-NEXT: v_accvgpr_write_b32 a10, v20
+; GCN-NEXT: v_accvgpr_write_b32 a11, v21
+; GCN-NEXT: v_accvgpr_write_b32 a12, v22
+; GCN-NEXT: v_accvgpr_write_b32 a13, v23
+; GCN-NEXT: v_accvgpr_write_b32 a14, v24
+; GCN-NEXT: v_accvgpr_write_b32 a15, v25
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:4 blgp:3
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
i32 4, ; cbsz
- i32 1, ; blgp
+ i32 3, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15] cbsz:4 blgp:1
-; SDAG-NEXT: s_nop 15
-; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v12
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v13
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a15, v27
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15] cbsz:4 blgp:1
-; GISEL-NEXT: s_nop 15
-; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
-; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v10
+; GCN-NEXT: v_accvgpr_write_b32 a1, v11
+; GCN-NEXT: v_accvgpr_write_b32 a2, v12
+; GCN-NEXT: v_accvgpr_write_b32 a3, v13
+; GCN-NEXT: v_accvgpr_write_b32 a4, v14
+; GCN-NEXT: v_accvgpr_write_b32 a5, v15
+; GCN-NEXT: v_accvgpr_write_b32 a6, v16
+; GCN-NEXT: v_accvgpr_write_b32 a7, v17
+; GCN-NEXT: v_accvgpr_write_b32 a8, v18
+; GCN-NEXT: v_accvgpr_write_b32 a9, v19
+; GCN-NEXT: v_accvgpr_write_b32 a10, v20
+; GCN-NEXT: v_accvgpr_write_b32 a11, v21
+; GCN-NEXT: v_accvgpr_write_b32 a12, v22
+; GCN-NEXT: v_accvgpr_write_b32 a13, v23
+; GCN-NEXT: v_accvgpr_write_b32 a14, v24
+; GCN-NEXT: v_accvgpr_write_b32 a15, v25
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15] cbsz:4 blgp:3
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
i32 4, ; cbsz
- i32 1, ; blgp
+ i32 3, ; blgp
i32 0, i32 0, i32 0, i32 0)
ret <16 x float> %result
}
-; fp4 x fp6
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a15, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v14
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v13
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v12
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v11
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v10
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:4 blgp:2
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v10
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v11
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v12
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v13
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a15, v25
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:4 blgp:2
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
-; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
+; fp4 x fp4
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4(<4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v8
+; GCN-NEXT: v_accvgpr_write_b32 a1, v9
+; GCN-NEXT: v_accvgpr_write_b32 a2, v10
+; GCN-NEXT: v_accvgpr_write_b32 a3, v11
+; GCN-NEXT: v_accvgpr_write_b32 a4, v12
+; GCN-NEXT: v_accvgpr_write_b32 a5, v13
+; GCN-NEXT: v_accvgpr_write_b32 a6, v14
+; GCN-NEXT: v_accvgpr_write_b32 a7, v15
+; GCN-NEXT: v_accvgpr_write_b32 a8, v16
+; GCN-NEXT: v_accvgpr_write_b32 a9, v17
+; GCN-NEXT: v_accvgpr_write_b32 a10, v18
+; GCN-NEXT: v_accvgpr_write_b32 a11, v19
+; GCN-NEXT: v_accvgpr_write_b32 a12, v20
+; GCN-NEXT: v_accvgpr_write_b32 a13, v21
+; GCN-NEXT: v_accvgpr_write_b32 a14, v22
+; GCN-NEXT: v_accvgpr_write_b32 a15, v23
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:7], a[0:15], v24, v25 op_sel_hi:[0,0,0] cbsz:4 blgp:4
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
i32 4, ; cbsz
- i32 2, ; blgp
+ i32 4, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0:
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0(<4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, v8
+; GCN-NEXT: v_accvgpr_write_b32 a1, v9
+; GCN-NEXT: v_accvgpr_write_b32 a2, v10
+; GCN-NEXT: v_accvgpr_write_b32 a3, v11
+; GCN-NEXT: v_accvgpr_write_b32 a4, v12
+; GCN-NEXT: v_accvgpr_write_b32 a5, v13
+; GCN-NEXT: v_accvgpr_write_b32 a6, v14
+; GCN-NEXT: v_accvgpr_write_b32 a7, v15
+; GCN-NEXT: v_accvgpr_write_b32 a8, v16
+; GCN-NEXT: v_accvgpr_write_b32 a9, v17
+; GCN-NEXT: v_accvgpr_write_b32 a10, v18
+; GCN-NEXT: v_accvgpr_write_b32 a11, v19
+; GCN-NEXT: v_accvgpr_write_b32 a12, v20
+; GCN-NEXT: v_accvgpr_write_b32 a13, v21
+; GCN-NEXT: v_accvgpr_write_b32 a14, v22
+; GCN-NEXT: v_accvgpr_write_b32 a15, v23
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:4 blgp:4
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
+ i32 4, ; cbsz
+ i32 4, ; blgp
+ i32 0, i32 0, i32 0, i32 0)
+ ret <16 x float> %result
+}
+
+; --------------------------------------------------------------------
+; Different input parameter classes
+; --------------------------------------------------------------------
+
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__sgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 inreg %scale0, i32 inreg %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__sgpr_scaleB:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: scratch_load_dword a15, off, s32
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: v_accvgpr_write_b32 a4, v20
+; GCN-NEXT: v_accvgpr_write_b32 a5, v21
+; GCN-NEXT: v_accvgpr_write_b32 a6, v22
+; GCN-NEXT: v_accvgpr_write_b32 a7, v23
+; GCN-NEXT: v_accvgpr_write_b32 a8, v24
+; GCN-NEXT: v_accvgpr_write_b32 a9, v25
+; GCN-NEXT: v_accvgpr_write_b32 a10, v26
+; GCN-NEXT: v_accvgpr_write_b32 a11, v27
+; GCN-NEXT: v_accvgpr_write_b32 a12, v28
+; GCN-NEXT: v_accvgpr_write_b32 a13, v29
+; GCN-NEXT: v_accvgpr_write_b32 a14, v30
+; GCN-NEXT: v_mov_b32_e32 v16, s0
+; GCN-NEXT: v_mov_b32_e32 v17, s1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0]
+; GCN-NEXT: s_nop 15
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <16 x float> %result
+}
+
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__vgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 inreg %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__vgpr_scaleB:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: scratch_load_dword a15, off, s32
+; GCN-NEXT: scratch_load_dword v31, off, s32 offset:4
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: v_accvgpr_write_b32 a4, v20
+; GCN-NEXT: v_accvgpr_write_b32 a5, v21
+; GCN-NEXT: v_accvgpr_write_b32 a6, v22
+; GCN-NEXT: v_accvgpr_write_b32 a7, v23
+; GCN-NEXT: v_accvgpr_write_b32 a8, v24
+; GCN-NEXT: v_accvgpr_write_b32 a9, v25
+; GCN-NEXT: v_accvgpr_write_b32 a10, v26
+; GCN-NEXT: v_accvgpr_write_b32 a11, v27
+; GCN-NEXT: v_accvgpr_write_b32 a12, v28
+; GCN-NEXT: v_accvgpr_write_b32 a13, v29
+; GCN-NEXT: v_accvgpr_write_b32 a14, v30
+; GCN-NEXT: v_mov_b32_e32 v16, s0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v31 op_sel_hi:[0,0,0]
+; GCN-NEXT: s_nop 15
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <16 x float> %result
+}
+
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgpr_scaleA__sgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 inreg %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgpr_scaleA__sgpr_scaleB:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: scratch_load_dword a15, off, s32
+; GCN-NEXT: scratch_load_dword v31, off, s32 offset:4
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: v_accvgpr_write_b32 a4, v20
+; GCN-NEXT: v_accvgpr_write_b32 a5, v21
+; GCN-NEXT: v_accvgpr_write_b32 a6, v22
+; GCN-NEXT: v_accvgpr_write_b32 a7, v23
+; GCN-NEXT: v_accvgpr_write_b32 a8, v24
+; GCN-NEXT: v_accvgpr_write_b32 a9, v25
+; GCN-NEXT: v_accvgpr_write_b32 a10, v26
+; GCN-NEXT: v_accvgpr_write_b32 a11, v27
+; GCN-NEXT: v_accvgpr_write_b32 a12, v28
+; GCN-NEXT: v_accvgpr_write_b32 a13, v29
+; GCN-NEXT: v_accvgpr_write_b32 a14, v30
+; GCN-NEXT: v_mov_b32_e32 v16, s0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v16 op_sel_hi:[0,0,0]
+; GCN-NEXT: s_nop 15
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <16 x float> %result
+}
+
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x float> inreg %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a15, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v14
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v13
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v12
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v11
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v10
+; SDAG-NEXT: v_mov_b32_e32 v26, s0
+; SDAG-NEXT: v_mov_b32_e32 v27, s1
+; SDAG-NEXT: v_mov_b32_e32 v28, s2
+; SDAG-NEXT: v_mov_b32_e32 v29, s3
+; SDAG-NEXT: v_mov_b32_e32 v30, s16
+; SDAG-NEXT: v_mov_b32_e32 v31, s17
+; SDAG-NEXT: v_mov_b32_e32 v32, s18
+; SDAG-NEXT: v_mov_b32_e32 v33, s19
+; SDAG-NEXT: v_mov_b32_e32 v16, s28
+; SDAG-NEXT: v_mov_b32_e32 v17, s29
+; SDAG-NEXT: v_mov_b32_e32 v18, s20
+; SDAG-NEXT: v_mov_b32_e32 v19, s21
+; SDAG-NEXT: v_mov_b32_e32 v20, s22
+; SDAG-NEXT: v_mov_b32_e32 v21, s23
+; SDAG-NEXT: v_mov_b32_e32 v22, s24
+; SDAG-NEXT: v_mov_b32_e32 v23, s25
+; SDAG-NEXT: v_mov_b32_e32 v24, s26
+; SDAG-NEXT: v_mov_b32_e32 v25, s27
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v0
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v1
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v2
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v3
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v4
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v5
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v6
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v7
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v8
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v9
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v10
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v11
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v13
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15] cbsz:4 blgp:2
-; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[18:25], a[0:15], v14, v15 op_sel_hi:[0,0,0]
+; SDAG-NEXT: s_nop 15
+; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
@@ -4787,28 +3503,43 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2__cons
; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0:
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v10
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v11
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v12
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v13
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a15, v25
+; GISEL-NEXT: s_mov_b32 s12, s0
+; GISEL-NEXT: s_mov_b32 s13, s1
+; GISEL-NEXT: s_mov_b32 s14, s2
+; GISEL-NEXT: s_mov_b32 s15, s3
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[12:13]
+; GISEL-NEXT: v_mov_b32_e32 v32, s28
+; GISEL-NEXT: v_mov_b32_e32 v33, s29
+; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[26:27]
+; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[20:21]
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v32
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v33
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v0
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v1
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v2
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v3
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v4
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v5
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v6
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v7
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v8
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v9
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v10
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v11
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v12
+; GISEL-NEXT: v_accvgpr_write_b32 a15, v13
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15] cbsz:4 blgp:2
-; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[16:23], v[24:31], a[0:15], v14, v15 op_sel_hi:[0,0,0]
+; GISEL-NEXT: s_nop 15
+; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
@@ -4826,37 +3557,43 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2__cons
; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
- i32 4, ; cbsz
- i32 2, ; blgp
- i32 0, i32 0, i32 0, i32 0)
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
-; fp4 x bf6
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3:
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr(<8 x i32> inreg %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 inreg %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a15, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v14
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v13
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v12
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v11
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v10
+; SDAG-NEXT: v_mov_b32_e32 v26, s0
+; SDAG-NEXT: v_mov_b32_e32 v27, s1
+; SDAG-NEXT: v_mov_b32_e32 v28, s2
+; SDAG-NEXT: v_mov_b32_e32 v29, s3
+; SDAG-NEXT: v_mov_b32_e32 v30, s16
+; SDAG-NEXT: v_mov_b32_e32 v31, s17
+; SDAG-NEXT: v_mov_b32_e32 v32, s18
+; SDAG-NEXT: v_mov_b32_e32 v33, s19
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v23
+; SDAG-NEXT: v_mov_b32_e32 v8, s20
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:4 blgp:3
-; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v8, v24 op_sel_hi:[0,0,0]
+; SDAG-NEXT: s_nop 15
+; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
@@ -4875,115 +3612,38 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3(<4 x
; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3:
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v10
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v11
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v12
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v13
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a15, v25
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:4 blgp:3
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
-; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
- i32 4, ; cbsz
- i32 3, ; blgp
- i32 0, i32 %scale0, i32 0, i32 %scale1)
- ret <16 x float> %result
-}
-
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a15, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v14
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v13
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v12
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v11
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v10
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15] cbsz:4 blgp:3
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v10
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v11
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v12
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v13
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a15, v25
+; GISEL-NEXT: s_mov_b32 s12, s0
+; GISEL-NEXT: s_mov_b32 s13, s1
+; GISEL-NEXT: s_mov_b32 s14, s2
+; GISEL-NEXT: s_mov_b32 s15, s3
+; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[12:13]
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v8
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v9
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v10
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v11
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v12
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v13
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v14
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v15
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a15, v23
+; GISEL-NEXT: v_mov_b32_e32 v8, s20
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15] cbsz:4 blgp:3
-; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v8, v24 op_sel_hi:[0,0,0]
+; GISEL-NEXT: s_nop 15
+; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
@@ -5001,37 +3661,43 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3__cons
; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
- i32 4, ; cbsz
- i32 3, ; blgp
- i32 0, i32 0, i32 0, i32 0)
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
-; fp4 x fp4
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4(<4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4:
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgpr_sgpr(<8 x i32> inreg %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 inreg %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgpr_sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a15, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v14
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v13
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v12
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
+; SDAG-NEXT: v_mov_b32_e32 v26, s0
+; SDAG-NEXT: v_mov_b32_e32 v27, s1
+; SDAG-NEXT: v_mov_b32_e32 v28, s2
+; SDAG-NEXT: v_mov_b32_e32 v29, s3
+; SDAG-NEXT: v_mov_b32_e32 v30, s16
+; SDAG-NEXT: v_mov_b32_e32 v31, s17
+; SDAG-NEXT: v_mov_b32_e32 v32, s18
+; SDAG-NEXT: v_mov_b32_e32 v33, s19
; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v23
+; SDAG-NEXT: v_mov_b32_e32 v8, s20
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:7], a[0:15], v24, v25 op_sel_hi:[0,0,0] cbsz:4 blgp:4
-; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v24, v8 op_sel_hi:[0,0,0]
+; SDAG-NEXT: s_nop 15
+; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
@@ -5050,9 +3716,17 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4(<4 x
; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4:
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgpr_sgpr:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s12, s0
+; GISEL-NEXT: s_mov_b32 s13, s1
+; GISEL-NEXT: s_mov_b32 s14, s2
+; GISEL-NEXT: s_mov_b32 s15, s3
+; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[12:13]
; GISEL-NEXT: v_accvgpr_write_b32 a0, v8
; GISEL-NEXT: v_accvgpr_write_b32 a1, v9
; GISEL-NEXT: v_accvgpr_write_b32 a2, v10
@@ -5069,9 +3743,11 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4(<4 x
; GISEL-NEXT: v_accvgpr_write_b32 a13, v21
; GISEL-NEXT: v_accvgpr_write_b32 a14, v22
; GISEL-NEXT: v_accvgpr_write_b32 a15, v23
+; GISEL-NEXT: v_mov_b32_e32 v8, s20
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:7], a[0:15], v24, v25 op_sel_hi:[0,0,0] cbsz:4 blgp:4
-; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v24, v8 op_sel_hi:[0,0,0]
+; GISEL-NEXT: s_nop 15
+; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
@@ -5089,36 +3765,43 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4(<4 x
; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
- i32 4, ; cbsz
- i32 4, ; blgp
- i32 0, i32 %scale0, i32 0, i32 %scale1)
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0(<4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0:
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgpr_sgpr(<8 x i32> %arg0, <8 x i32> inreg %arg1, <16 x float> %arg2, i32 %scale0, i32 inreg %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgpr_sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a15, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v14
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v13
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v12
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
+; SDAG-NEXT: v_mov_b32_e32 v26, s0
+; SDAG-NEXT: v_mov_b32_e32 v27, s1
+; SDAG-NEXT: v_mov_b32_e32 v28, s2
+; SDAG-NEXT: v_mov_b32_e32 v29, s3
+; SDAG-NEXT: v_mov_b32_e32 v30, s16
+; SDAG-NEXT: v_mov_b32_e32 v31, s17
+; SDAG-NEXT: v_mov_b32_e32 v32, s18
+; SDAG-NEXT: v_mov_b32_e32 v33, s19
; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v23
+; SDAG-NEXT: v_mov_b32_e32 v8, s20
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:4 blgp:4
-; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[26:33], a[0:15], v24, v8 op_sel_hi:[0,0,0]
+; SDAG-NEXT: s_nop 15
+; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
@@ -5137,9 +3820,17 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4__cons
; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0:
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgpr_sgpr:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 s12, s0
+; GISEL-NEXT: s_mov_b32 s13, s1
+; GISEL-NEXT: s_mov_b32 s14, s2
+; GISEL-NEXT: s_mov_b32 s15, s3
+; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[12:13]
; GISEL-NEXT: v_accvgpr_write_b32 a0, v8
; GISEL-NEXT: v_accvgpr_write_b32 a1, v9
; GISEL-NEXT: v_accvgpr_write_b32 a2, v10
@@ -5156,9 +3847,11 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4__cons
; GISEL-NEXT: v_accvgpr_write_b32 a13, v21
; GISEL-NEXT: v_accvgpr_write_b32 a14, v22
; GISEL-NEXT: v_accvgpr_write_b32 a15, v23
+; GISEL-NEXT: v_mov_b32_e32 v8, s20
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:4 blgp:4
-; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[26:33], a[0:15], v24, v8 op_sel_hi:[0,0,0]
+; GISEL-NEXT: s_nop 15
+; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
@@ -5176,55 +3869,109 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4__cons
; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
- i32 4, ; cbsz
- i32 4, ; blgp
- i32 0, i32 0, i32 0, i32 0)
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
-; --------------------------------------------------------------------
-; Different input parameter classes
-; --------------------------------------------------------------------
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_vgpr_sgpr__vgpr_sgpr(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> inreg %arg2, i32 %scale0, i32 inreg %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_vgpr_sgpr__vgpr_sgpr:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_accvgpr_write_b32 a0, s0
+; GCN-NEXT: v_accvgpr_write_b32 a1, s1
+; GCN-NEXT: v_accvgpr_write_b32 a2, s2
+; GCN-NEXT: v_accvgpr_write_b32 a3, s3
+; GCN-NEXT: v_accvgpr_write_b32 a4, s16
+; GCN-NEXT: v_accvgpr_write_b32 a5, s17
+; GCN-NEXT: v_accvgpr_write_b32 a6, s18
+; GCN-NEXT: v_accvgpr_write_b32 a7, s19
+; GCN-NEXT: v_accvgpr_write_b32 a8, s20
+; GCN-NEXT: v_accvgpr_write_b32 a9, s21
+; GCN-NEXT: v_accvgpr_write_b32 a10, s22
+; GCN-NEXT: v_accvgpr_write_b32 a11, s23
+; GCN-NEXT: v_accvgpr_write_b32 a12, s24
+; GCN-NEXT: v_accvgpr_write_b32 a13, s25
+; GCN-NEXT: v_accvgpr_write_b32 a14, s26
+; GCN-NEXT: v_accvgpr_write_b32 a15, s27
+; GCN-NEXT: v_mov_b32_e32 v17, s28
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0]
+; GCN-NEXT: s_nop 15
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <16 x float> %result
+}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__sgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 inreg %scale0, i32 inreg %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__sgpr_scaleB:
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgpr_sgpr(<8 x i32> inreg %arg0, <8 x i32> %arg1, <16 x float> inreg %arg2, i32 %scale0, i32 inreg %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgpr_sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_mov_b32_e32 v26, s0
+; SDAG-NEXT: v_mov_b32_e32 v27, s1
+; SDAG-NEXT: v_mov_b32_e32 v28, s2
+; SDAG-NEXT: v_mov_b32_e32 v29, s3
+; SDAG-NEXT: v_mov_b32_e32 v30, s16
+; SDAG-NEXT: v_mov_b32_e32 v31, s17
+; SDAG-NEXT: v_mov_b32_e32 v32, s18
+; SDAG-NEXT: v_mov_b32_e32 v33, s19
+; SDAG-NEXT: v_mov_b32_e32 v16, s20
+; SDAG-NEXT: v_mov_b32_e32 v17, s21
+; SDAG-NEXT: v_mov_b32_e32 v18, s22
+; SDAG-NEXT: v_mov_b32_e32 v19, s23
+; SDAG-NEXT: v_mov_b32_e32 v20, s24
+; SDAG-NEXT: v_mov_b32_e32 v21, s25
+; SDAG-NEXT: v_mov_b32_e32 v22, s26
+; SDAG-NEXT: v_mov_b32_e32 v23, s27
+; SDAG-NEXT: v_mov_b32_e32 v24, s28
+; SDAG-NEXT: v_mov_b32_e32 v25, s29
; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: v_mov_b32_e32 v16, s0
-; SDAG-NEXT: v_mov_b32_e32 v17, s1
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0]
-; SDAG-NEXT: s_nop 15
-; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v8
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v9
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v10
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v11
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v13
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0]
+; SDAG-NEXT: s_nop 15
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
@@ -5232,30 +3979,45 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__sgpr_
; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__sgpr_scaleB:
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgpr_sgpr:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
-; GISEL-NEXT: v_mov_b32_e32 v16, s0
-; GISEL-NEXT: v_mov_b32_e32 v17, s1
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0]
+; GISEL-NEXT: s_mov_b32 s12, s0
+; GISEL-NEXT: s_mov_b32 s13, s1
+; GISEL-NEXT: s_mov_b32 s14, s2
+; GISEL-NEXT: s_mov_b32 s15, s3
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[12:13]
+; GISEL-NEXT: v_mov_b32_e32 v24, s20
+; GISEL-NEXT: v_mov_b32_e32 v25, s21
+; GISEL-NEXT: v_mov_b32_e32 v26, s22
+; GISEL-NEXT: v_mov_b32_e32 v27, s23
+; GISEL-NEXT: v_mov_b32_e32 v28, s24
+; GISEL-NEXT: v_mov_b32_e32 v29, s25
+; GISEL-NEXT: v_mov_b32_e32 v30, s26
+; GISEL-NEXT: v_mov_b32_e32 v31, s27
+; GISEL-NEXT: v_mov_b32_e32 v32, s28
+; GISEL-NEXT: v_mov_b32_e32 v33, s29
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v29
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v30
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v31
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v32
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v33
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v8
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v9
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v10
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v11
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v12
+; GISEL-NEXT: v_accvgpr_write_b32 a15, v13
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[16:23], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0]
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
@@ -5279,31 +4041,31 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__sgpr_
ret <16 x float> %result
}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__vgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 inreg %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__vgpr_scaleB:
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:4
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_mov_b32_e32 v31, -2
+; SDAG-NEXT: v_mov_b32_e32 v32, 33
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_mov_b32_e32 v17, s0
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[0,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0]
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
@@ -5324,12 +4086,13 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__vgpr_
; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__vgpr_scaleB:
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4
+; GISEL-NEXT: v_mov_b32_e32 v31, 33
+; GISEL-NEXT: v_mov_b32_e32 v32, -2
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
@@ -5344,10 +4107,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__vgpr_
; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
-; GISEL-NEXT: v_mov_b32_e32 v17, s0
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[0,0,0]
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0]
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
@@ -5367,35 +4129,35 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__vgpr_
; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 33, i32 2, i32 -2)
ret <16 x float> %result
}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgpr_scaleA__sgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 inreg %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgpr_scaleA__sgpr_scaleB:
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_inlineimm:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:4
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_mov_b32_e32 v31, -2
+; SDAG-NEXT: v_mov_b32_e32 v32, 0x41
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_mov_b32_e32 v17, s0
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0]
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
@@ -5416,12 +4178,13 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgpr_scaleA__sgpr_
; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgpr_scaleA__sgpr_scaleB:
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_inlineimm:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4
+; GISEL-NEXT: v_mov_b32_e32 v31, 0x41
+; GISEL-NEXT: v_mov_b32_e32 v32, -2
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
@@ -5436,10 +4199,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgpr_scaleA__sgpr_
; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
-; GISEL-NEXT: v_mov_b32_e32 v17, s0
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0]
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0]
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
@@ -5459,48 +4221,35 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgpr_scaleA__sgpr_
; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, i32 -2)
ret <16 x float> %result
}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x float> inreg %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs:
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_FP_literal(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_FP_literal:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v16, s0
-; SDAG-NEXT: v_mov_b32_e32 v17, s1
-; SDAG-NEXT: v_mov_b32_e32 v18, s2
-; SDAG-NEXT: v_mov_b32_e32 v19, s3
-; SDAG-NEXT: v_mov_b32_e32 v20, s16
-; SDAG-NEXT: v_mov_b32_e32 v21, s17
-; SDAG-NEXT: v_mov_b32_e32 v22, s18
-; SDAG-NEXT: v_mov_b32_e32 v23, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a15, v13
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v12
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v11
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v10
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v9
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v8
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v7
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v6
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v5
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v4
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v3
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v2
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v1
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v0
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s28
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s29
-; SDAG-NEXT: v_mov_b32_e32 v0, s20
-; SDAG-NEXT: v_mov_b32_e32 v1, s21
-; SDAG-NEXT: v_mov_b32_e32 v2, s22
-; SDAG-NEXT: v_mov_b32_e32 v3, s23
-; SDAG-NEXT: v_mov_b32_e32 v4, s24
-; SDAG-NEXT: v_mov_b32_e32 v5, s25
-; SDAG-NEXT: v_mov_b32_e32 v6, s26
-; SDAG-NEXT: v_mov_b32_e32 v7, s27
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[16:23], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0]
+; SDAG-NEXT: scratch_load_dword a15, off, s32
+; SDAG-NEXT: v_mov_b32_e32 v31, 1.0
+; SDAG-NEXT: v_mov_b32_e32 v32, 0x41
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0]
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
@@ -5521,39 +4270,30 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inr
; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs:
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_FP_literal:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: s_mov_b32 s12, s0
-; GISEL-NEXT: s_mov_b32 s13, s1
-; GISEL-NEXT: s_mov_b32 s14, s2
-; GISEL-NEXT: s_mov_b32 s15, s3
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v0
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v1
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v2
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v3
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v4
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v5
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v6
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v7
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v8
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v9
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v10
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v11
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v12
-; GISEL-NEXT: v_accvgpr_write_b32 a15, v13
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s28
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s29
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
-; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[24:25]
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[26:27]
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[16:23], a[0:15], v14, v15 op_sel_hi:[0,0,0]
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: v_mov_b32_e32 v31, 0x41
+; GISEL-NEXT: v_mov_b32_e32 v32, 1.0
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0]
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
@@ -5573,91 +4313,79 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inr
; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, i32 1065353216)
ret <16 x float> %result
}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr(<8 x i32> inreg %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 inreg %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr:
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_inlineimm:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v14
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v13
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v12
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
-; SDAG-NEXT: v_mov_b32_e32 v10, s0
-; SDAG-NEXT: v_mov_b32_e32 v11, s1
-; SDAG-NEXT: v_mov_b32_e32 v12, s2
-; SDAG-NEXT: v_mov_b32_e32 v13, s3
-; SDAG-NEXT: v_mov_b32_e32 v14, s16
-; SDAG-NEXT: v_mov_b32_e32 v15, s17
-; SDAG-NEXT: v_mov_b32_e32 v16, s18
-; SDAG-NEXT: v_mov_b32_e32 v17, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a15, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
-; SDAG-NEXT: v_mov_b32_e32 v8, s20
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[10:17], v[0:7], a[0:15], v8, v24 op_sel_hi:[0,0,0]
-; SDAG-NEXT: s_nop 15
-; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: scratch_load_dword a15, off, s32
+; SDAG-NEXT: v_mov_b32_e32 v31, -2
+; SDAG-NEXT: v_mov_b32_e32 v32, 1.0
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0]
+; SDAG-NEXT: s_nop 15
+; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr:
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_inlineimm:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: s_mov_b32 s12, s0
-; GISEL-NEXT: s_mov_b32 s13, s1
-; GISEL-NEXT: s_mov_b32 s14, s2
-; GISEL-NEXT: s_mov_b32 s15, s3
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v11
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v12
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v13
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v17
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[18:19]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v8
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v9
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a15, v23
-; GISEL-NEXT: v_mov_b32_e32 v8, s20
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[10:17], v[0:7], a[0:15], v8, v24 op_sel_hi:[0,0,0]
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: v_mov_b32_e32 v31, 1.0
+; GISEL-NEXT: v_mov_b32_e32 v32, -2
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0]
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
@@ -5677,41 +4405,35 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp
; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 1065353216, i32 2, i32 -2)
ret <16 x float> %result
}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgpr_sgpr(<8 x i32> inreg %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 inreg %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgpr_sgpr:
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_FP_literal(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_FP_literal:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v14
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v13
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v12
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
-; SDAG-NEXT: v_mov_b32_e32 v10, s0
-; SDAG-NEXT: v_mov_b32_e32 v11, s1
-; SDAG-NEXT: v_mov_b32_e32 v12, s2
-; SDAG-NEXT: v_mov_b32_e32 v13, s3
-; SDAG-NEXT: v_mov_b32_e32 v14, s16
-; SDAG-NEXT: v_mov_b32_e32 v15, s17
-; SDAG-NEXT: v_mov_b32_e32 v16, s18
-; SDAG-NEXT: v_mov_b32_e32 v17, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a15, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
-; SDAG-NEXT: v_mov_b32_e32 v8, s20
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[10:17], v[0:7], a[0:15], v24, v8 op_sel_hi:[0,0,0]
+; SDAG-NEXT: scratch_load_dword a15, off, s32
+; SDAG-NEXT: v_mov_b32_e32 v31, 1.0
+; SDAG-NEXT: v_mov_b32_e32 v32, 0.15915494
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0]
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
@@ -5732,36 +4454,30 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp
; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgpr_sgpr:
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_FP_literal:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: s_mov_b32 s12, s0
-; GISEL-NEXT: s_mov_b32 s13, s1
-; GISEL-NEXT: s_mov_b32 s14, s2
-; GISEL-NEXT: s_mov_b32 s15, s3
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v11
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v12
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v13
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v17
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[18:19]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v8
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v9
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a15, v23
-; GISEL-NEXT: v_mov_b32_e32 v8, s20
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[10:17], v[0:7], a[0:15], v24, v8 op_sel_hi:[0,0,0]
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: v_mov_b32_e32 v31, 0.15915494
+; GISEL-NEXT: v_mov_b32_e32 v32, 1.0
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0]
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
@@ -5781,41 +4497,35 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp
; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 1042479491, i32 2, i32 1065353216)
ret <16 x float> %result
}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgpr_sgpr(<8 x i32> %arg0, <8 x i32> inreg %arg1, <16 x float> %arg2, i32 %scale0, i32 inreg %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgpr_sgpr:
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_kimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_kimm:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a15, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v14
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v13
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v12
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
-; SDAG-NEXT: v_mov_b32_e32 v10, s0
-; SDAG-NEXT: v_mov_b32_e32 v11, s1
-; SDAG-NEXT: v_mov_b32_e32 v12, s2
-; SDAG-NEXT: v_mov_b32_e32 v13, s3
-; SDAG-NEXT: v_mov_b32_e32 v14, s16
-; SDAG-NEXT: v_mov_b32_e32 v15, s17
-; SDAG-NEXT: v_mov_b32_e32 v16, s18
-; SDAG-NEXT: v_mov_b32_e32 v17, s19
-; SDAG-NEXT: v_mov_b32_e32 v8, s20
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[10:17], a[0:15], v24, v8 op_sel_hi:[0,0,0]
+; SDAG-NEXT: scratch_load_dword a15, off, s32
+; SDAG-NEXT: v_mov_b32_e32 v31, 0x4d
+; SDAG-NEXT: v_mov_b32_e32 v32, 0x41
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0]
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
@@ -5836,36 +4546,30 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp
; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgpr_sgpr:
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_kimm:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: s_mov_b32 s12, s0
-; GISEL-NEXT: s_mov_b32 s13, s1
-; GISEL-NEXT: s_mov_b32 s14, s2
-; GISEL-NEXT: s_mov_b32 s15, s3
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v11
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v12
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v13
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v17
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[12:13]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v8
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v9
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a15, v23
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[18:19]
-; GISEL-NEXT: v_mov_b32_e32 v8, s20
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[10:17], a[0:15], v24, v8 op_sel_hi:[0,0,0]
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: v_mov_b32_e32 v31, 0x41
+; GISEL-NEXT: v_mov_b32_e32 v32, 0x4d
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0]
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
@@ -5885,2016 +4589,854 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp
; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, i32 77)
ret <16 x float> %result
}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_vgpr_sgpr__vgpr_sgpr(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> inreg %arg2, i32 %scale0, i32 inreg %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_vgpr_sgpr__vgpr_sgpr:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, s0
-; GCN-NEXT: v_accvgpr_write_b32 a1, s1
-; GCN-NEXT: v_accvgpr_write_b32 a2, s2
-; GCN-NEXT: v_accvgpr_write_b32 a3, s3
-; GCN-NEXT: v_accvgpr_write_b32 a4, s16
-; GCN-NEXT: v_accvgpr_write_b32 a5, s17
-; GCN-NEXT: v_accvgpr_write_b32 a6, s18
-; GCN-NEXT: v_accvgpr_write_b32 a7, s19
-; GCN-NEXT: v_accvgpr_write_b32 a8, s20
-; GCN-NEXT: v_accvgpr_write_b32 a9, s21
-; GCN-NEXT: v_accvgpr_write_b32 a10, s22
-; GCN-NEXT: v_accvgpr_write_b32 a11, s23
-; GCN-NEXT: v_accvgpr_write_b32 a12, s24
-; GCN-NEXT: v_accvgpr_write_b32 a13, s25
-; GCN-NEXT: v_accvgpr_write_b32 a14, s26
-; GCN-NEXT: v_accvgpr_write_b32 a15, s27
-; GCN-NEXT: v_mov_b32_e32 v17, s28
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0]
-; GCN-NEXT: s_nop 15
-; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
-; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
- ret <16 x float> %result
-}
-
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgpr_sgpr(<8 x i32> inreg %arg0, <8 x i32> %arg1, <16 x float> inreg %arg2, i32 %scale0, i32 inreg %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgpr_sgpr:
+define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1, ptr addrspace(1) %ptr) #0 {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd:
; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v16, s0
-; SDAG-NEXT: v_mov_b32_e32 v17, s1
-; SDAG-NEXT: v_mov_b32_e32 v18, s2
-; SDAG-NEXT: v_mov_b32_e32 v19, s3
-; SDAG-NEXT: v_mov_b32_e32 v20, s16
-; SDAG-NEXT: v_mov_b32_e32 v21, s17
-; SDAG-NEXT: v_mov_b32_e32 v22, s18
-; SDAG-NEXT: v_mov_b32_e32 v23, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a15, v13
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v12
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v11
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v10
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v9
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v8
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s20
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s21
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s22
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s23
-; SDAG-NEXT: v_accvgpr_write_b32 a4, s24
-; SDAG-NEXT: v_accvgpr_write_b32 a5, s25
-; SDAG-NEXT: v_accvgpr_write_b32 a6, s26
-; SDAG-NEXT: v_accvgpr_write_b32 a7, s27
-; SDAG-NEXT: v_accvgpr_write_b32 a8, s28
-; SDAG-NEXT: v_accvgpr_write_b32 a9, s29
+; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
+; SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40
+; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x80
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
+; SDAG-NEXT: v_mov_b32_e32 v16, s8
+; SDAG-NEXT: v_mov_b32_e32 v17, s9
+; SDAG-NEXT: v_mov_b32_e32 v18, s10
+; SDAG-NEXT: v_mov_b32_e32 v19, s11
+; SDAG-NEXT: v_mov_b32_e32 v20, s12
+; SDAG-NEXT: v_mov_b32_e32 v21, s13
+; SDAG-NEXT: v_mov_b32_e32 v22, s14
+; SDAG-NEXT: v_mov_b32_e32 v23, s15
+; SDAG-NEXT: v_mov_b32_e32 v24, s16
+; SDAG-NEXT: v_mov_b32_e32 v25, s17
+; SDAG-NEXT: v_mov_b32_e32 v26, s18
+; SDAG-NEXT: v_mov_b32_e32 v27, s19
+; SDAG-NEXT: v_mov_b32_e32 v28, s20
+; SDAG-NEXT: v_mov_b32_e32 v29, s21
+; SDAG-NEXT: v_mov_b32_e32 v30, s22
+; SDAG-NEXT: v_mov_b32_e32 v31, s23
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
+; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
+; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
+; SDAG-NEXT: v_mov_b32_e32 v32, s0
+; SDAG-NEXT: v_mov_b32_e32 v33, s1
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[16:23], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, v33 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_nop 15
-; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT: s_setpc_b64 s[30:31]
+; SDAG-NEXT: s_nop 2
+; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:48
+; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:32
+; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:16
+; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3]
+; SDAG-NEXT: s_endpgm
;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgpr_sgpr:
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd:
; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: s_mov_b32 s12, s0
-; GISEL-NEXT: s_mov_b32 s13, s1
-; GISEL-NEXT: s_mov_b32 s14, s2
-; GISEL-NEXT: s_mov_b32 s15, s3
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[12:13]
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v8
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v9
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v10
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v11
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v12
-; GISEL-NEXT: v_accvgpr_write_b32 a15, v13
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s20
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s21
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s22
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s23
-; GISEL-NEXT: v_accvgpr_write_b32 a4, s24
-; GISEL-NEXT: v_accvgpr_write_b32 a5, s25
-; GISEL-NEXT: v_accvgpr_write_b32 a6, s26
-; GISEL-NEXT: v_accvgpr_write_b32 a7, s27
-; GISEL-NEXT: v_accvgpr_write_b32 a8, s28
-; GISEL-NEXT: v_accvgpr_write_b32 a9, s29
+; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
+; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40
+; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x80
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
+; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
+; GISEL-NEXT: v_mov_b32_e32 v32, s0
+; GISEL-NEXT: v_mov_b32_e32 v33, s1
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[16:23], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0]
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, v33 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
+; GISEL-NEXT: v_mov_b32_e32 v16, 0
; GISEL-NEXT: s_nop 15
-; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
-; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
- ret <16 x float> %result
+; GISEL-NEXT: s_nop 2
+; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3]
+; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:16
+; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:32
+; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:48
+; GISEL-NEXT: s_endpgm
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 3, i32 %scale0, i32 1, i32 %scale1)
+ store <16 x float> %result, ptr addrspace(1) %ptr, align 64
+ ret void
}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm:
+define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, ptr addrspace(1) %ptr) #0 {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm:
; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: v_mov_b32_e32 v16, -2
-; SDAG-NEXT: v_mov_b32_e32 v17, 33
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[1,1,0]
+; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
+; SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40
+; SDAG-NEXT: v_mov_b32_e32 v32, -2
+; SDAG-NEXT: v_mov_b32_e32 v33, 0x41
+; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: v_mov_b32_e32 v16, s8
+; SDAG-NEXT: v_mov_b32_e32 v17, s9
+; SDAG-NEXT: v_mov_b32_e32 v18, s10
+; SDAG-NEXT: v_mov_b32_e32 v19, s11
+; SDAG-NEXT: v_mov_b32_e32 v20, s12
+; SDAG-NEXT: v_mov_b32_e32 v21, s13
+; SDAG-NEXT: v_mov_b32_e32 v22, s14
+; SDAG-NEXT: v_mov_b32_e32 v23, s15
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
+; SDAG-NEXT: v_mov_b32_e32 v24, s16
+; SDAG-NEXT: v_mov_b32_e32 v25, s17
+; SDAG-NEXT: v_mov_b32_e32 v26, s18
+; SDAG-NEXT: v_mov_b32_e32 v27, s19
+; SDAG-NEXT: v_mov_b32_e32 v28, s20
+; SDAG-NEXT: v_mov_b32_e32 v29, s21
+; SDAG-NEXT: v_mov_b32_e32 v30, s22
+; SDAG-NEXT: v_mov_b32_e32 v31, s23
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
+; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
+; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v33, v32 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_nop 15
-; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
-; GISEL-NEXT: v_mov_b32_e32 v16, 33
-; GISEL-NEXT: v_mov_b32_e32 v17, -2
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[1,1,0]
+; SDAG-NEXT: s_nop 2
+; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
+; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40
+; GISEL-NEXT: v_mov_b32_e32 v32, 0x41
+; GISEL-NEXT: v_mov_b32_e32 v33, -2
+; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
+; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, v33 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
+; GISEL-NEXT: v_mov_b32_e32 v16, 0
; GISEL-NEXT: s_nop 15
-; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
-; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 33, i32 2, i32 -2)
- ret <16 x float> %result
+; GISEL-NEXT: s_nop 2
+; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; GISEL-NEXT: s_endpgm
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 3, i32 65, i32 1, i32 -2)
+ store <16 x float> %result, ptr addrspace(1) %ptr, align 64
+ ret void
}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_inlineimm:
+define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) #1 {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac:
; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: v_mov_b32_e32 v16, -2
-; SDAG-NEXT: v_mov_b32_e32 v17, 0x41
+; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: v_mov_b32_e32 v2, s12
+; SDAG-NEXT: v_mov_b32_e32 v3, s13
+; SDAG-NEXT: v_mov_b32_e32 v4, s14
+; SDAG-NEXT: v_mov_b32_e32 v5, s15
+; SDAG-NEXT: v_mov_b32_e32 v6, s16
+; SDAG-NEXT: v_mov_b32_e32 v7, s17
+; SDAG-NEXT: v_mov_b32_e32 v8, s18
+; SDAG-NEXT: v_mov_b32_e32 v9, s19
+; SDAG-NEXT: v_mov_b32_e32 v10, s20
+; SDAG-NEXT: v_mov_b32_e32 v11, s21
+; SDAG-NEXT: v_mov_b32_e32 v12, s22
+; SDAG-NEXT: v_mov_b32_e32 v13, s23
+; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
+; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80
+; SDAG-NEXT: v_mov_b32_e32 v14, s24
+; SDAG-NEXT: v_mov_b32_e32 v15, s25
+; SDAG-NEXT: v_mov_b32_e32 v16, s26
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
+; SDAG-NEXT: v_mov_b32_e32 v17, s27
+; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
+; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
+; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
+; SDAG-NEXT: v_accvgpr_write_b32 a4, s12
+; SDAG-NEXT: v_accvgpr_write_b32 a5, s13
+; SDAG-NEXT: v_accvgpr_write_b32 a6, s14
+; SDAG-NEXT: v_accvgpr_write_b32 a7, s15
+; SDAG-NEXT: v_accvgpr_write_b32 a8, s16
+; SDAG-NEXT: v_accvgpr_write_b32 a9, s17
+; SDAG-NEXT: v_accvgpr_write_b32 a10, s18
+; SDAG-NEXT: v_accvgpr_write_b32 a11, s19
+; SDAG-NEXT: v_accvgpr_write_b32 a12, s20
+; SDAG-NEXT: v_accvgpr_write_b32 a13, s21
+; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
+; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
+; SDAG-NEXT: v_mov_b32_e32 v0, s0
+; SDAG-NEXT: v_mov_b32_e32 v1, s1
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[2:9], v[10:17], a[0:15], v0, v1 op_sel_hi:[0,0,0]
+; SDAG-NEXT: v_mov_b32_e32 v2, s20
+; SDAG-NEXT: v_mov_b32_e32 v3, s21
+; SDAG-NEXT: v_mov_b32_e32 v4, s22
+; SDAG-NEXT: v_mov_b32_e32 v5, s23
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48
+; SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[1,1,0]
-; SDAG-NEXT: s_nop 15
-; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT: s_setpc_b64 s[30:31]
+; SDAG-NEXT: v_mov_b32_e32 v6, s18
+; SDAG-NEXT: v_mov_b32_e32 v7, s19
+; SDAG-NEXT: v_mov_b32_e32 v4, s16
+; SDAG-NEXT: v_mov_b32_e32 v5, s17
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], 32
+; SDAG-NEXT: global_store_dwordx4 v[2:3], v[4:7], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: v_mov_b32_e32 v8, s14
+; SDAG-NEXT: v_mov_b32_e32 v9, s15
+; SDAG-NEXT: v_mov_b32_e32 v6, s12
+; SDAG-NEXT: v_mov_b32_e32 v7, s13
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16
+; SDAG-NEXT: global_store_dwordx4 v[4:5], v[6:9], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: v_mov_b32_e32 v10, s10
+; SDAG-NEXT: v_mov_b32_e32 v11, s11
+; SDAG-NEXT: v_mov_b32_e32 v8, s8
+; SDAG-NEXT: v_mov_b32_e32 v9, s9
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], 0
+; SDAG-NEXT: global_store_dwordx4 v[6:7], v[8:11], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: global_store_dwordx4 v[2:3], a[8:11], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: global_store_dwordx4 v[0:1], a[12:15], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: global_store_dwordx4 v[6:7], a[0:3], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: global_store_dwordx4 v[4:5], a[4:7], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_endpgm
;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_inlineimm:
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac:
; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
-; GISEL-NEXT: v_mov_b32_e32 v16, 0x41
-; GISEL-NEXT: v_mov_b32_e32 v17, -2
-; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0
+; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
+; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], 0
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], 16
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
+; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
+; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
+; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
+; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
+; GISEL-NEXT: v_accvgpr_write_b32 a4, s12
+; GISEL-NEXT: v_accvgpr_write_b32 a5, s13
+; GISEL-NEXT: v_accvgpr_write_b32 a6, s14
+; GISEL-NEXT: v_accvgpr_write_b32 a7, s15
+; GISEL-NEXT: v_accvgpr_write_b32 a8, s16
+; GISEL-NEXT: v_accvgpr_write_b32 a9, s17
+; GISEL-NEXT: v_accvgpr_write_b32 a10, s18
+; GISEL-NEXT: v_accvgpr_write_b32 a11, s19
+; GISEL-NEXT: v_accvgpr_write_b32 a12, s20
+; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
+; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
+; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
+; GISEL-NEXT: v_mov_b32_e32 v20, s0
+; GISEL-NEXT: v_mov_b32_e32 v21, s1
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], 48
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[1,1,0]
-; GISEL-NEXT: s_nop 15
-; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
-; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, i32 -2)
- ret <16 x float> %result
-}
-
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_FP_literal(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_FP_literal:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: v_mov_b32_e32 v16, 1.0
-; SDAG-NEXT: v_mov_b32_e32 v17, 0x41
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[1,1,0]
-; SDAG-NEXT: s_nop 15
-; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_FP_literal:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
-; GISEL-NEXT: v_mov_b32_e32 v16, 0x41
-; GISEL-NEXT: v_mov_b32_e32 v17, 1.0
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[1,1,0]
-; GISEL-NEXT: s_nop 15
-; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
-; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, i32 1065353216)
- ret <16 x float> %result
-}
-
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_inlineimm:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: v_mov_b32_e32 v16, -2
-; SDAG-NEXT: v_mov_b32_e32 v17, 1.0
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[1,1,0]
-; SDAG-NEXT: s_nop 15
-; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_inlineimm:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
-; GISEL-NEXT: v_mov_b32_e32 v16, 1.0
-; GISEL-NEXT: v_mov_b32_e32 v17, -2
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[1,1,0]
-; GISEL-NEXT: s_nop 15
-; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
-; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 1065353216, i32 2, i32 -2)
- ret <16 x float> %result
-}
-
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_FP_literal(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_FP_literal:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: v_mov_b32_e32 v16, 1.0
-; SDAG-NEXT: v_mov_b32_e32 v17, 0.15915494
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[1,1,0]
-; SDAG-NEXT: s_nop 15
-; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_FP_literal:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
-; GISEL-NEXT: v_mov_b32_e32 v16, 0.15915494
-; GISEL-NEXT: v_mov_b32_e32 v17, 1.0
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[1,1,0]
-; GISEL-NEXT: s_nop 15
-; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
-; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 1042479491, i32 2, i32 1065353216)
- ret <16 x float> %result
-}
-
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_kimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_kimm:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: v_mov_b32_e32 v16, 0x4d
-; SDAG-NEXT: v_mov_b32_e32 v17, 0x41
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[1,1,0]
-; SDAG-NEXT: s_nop 15
-; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_kimm:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
-; GISEL-NEXT: v_mov_b32_e32 v16, 0x41
-; GISEL-NEXT: v_mov_b32_e32 v17, 0x4d
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[1,1,0]
-; GISEL-NEXT: s_nop 15
-; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
-; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, i32 77)
- ret <16 x float> %result
-}
-
-define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1, ptr addrspace(1) %ptr) #0 {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
-; SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40
-; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x80
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
-; SDAG-NEXT: v_mov_b32_e32 v16, s8
-; SDAG-NEXT: v_mov_b32_e32 v17, s9
-; SDAG-NEXT: v_mov_b32_e32 v18, s10
-; SDAG-NEXT: v_mov_b32_e32 v19, s11
-; SDAG-NEXT: v_mov_b32_e32 v20, s12
-; SDAG-NEXT: v_mov_b32_e32 v21, s13
-; SDAG-NEXT: v_mov_b32_e32 v22, s14
-; SDAG-NEXT: v_mov_b32_e32 v23, s15
-; SDAG-NEXT: v_mov_b32_e32 v24, s16
-; SDAG-NEXT: v_mov_b32_e32 v25, s17
-; SDAG-NEXT: v_mov_b32_e32 v26, s18
-; SDAG-NEXT: v_mov_b32_e32 v27, s19
-; SDAG-NEXT: v_mov_b32_e32 v28, s20
-; SDAG-NEXT: v_mov_b32_e32 v29, s21
-; SDAG-NEXT: v_mov_b32_e32 v30, s22
-; SDAG-NEXT: v_mov_b32_e32 v31, s23
-; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
-; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
-; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
-; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
-; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
-; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
-; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
-; SDAG-NEXT: v_mov_b32_e32 v32, s0
-; SDAG-NEXT: v_mov_b32_e32 v33, s1
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, v33 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
-; SDAG-NEXT: s_nop 15
-; SDAG-NEXT: s_nop 2
-; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:48
-; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:32
-; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:16
-; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3]
-; SDAG-NEXT: s_endpgm
-;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
-; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40
-; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x80
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
-; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
-; GISEL-NEXT: v_mov_b32_e32 v32, s0
-; GISEL-NEXT: v_mov_b32_e32 v33, s1
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, v33 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
-; GISEL-NEXT: v_mov_b32_e32 v16, 0
-; GISEL-NEXT: s_nop 15
-; GISEL-NEXT: s_nop 2
-; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3]
-; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:16
-; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:32
-; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:48
-; GISEL-NEXT: s_endpgm
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 3, i32 %scale0, i32 1, i32 %scale1)
- store <16 x float> %result, ptr addrspace(1) %ptr, align 64
- ret void
-}
-
-define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, ptr addrspace(1) %ptr) #0 {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
-; SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40
-; SDAG-NEXT: v_mov_b32_e32 v32, -2
-; SDAG-NEXT: v_mov_b32_e32 v33, 0x41
-; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v16, s8
-; SDAG-NEXT: v_mov_b32_e32 v17, s9
-; SDAG-NEXT: v_mov_b32_e32 v18, s10
-; SDAG-NEXT: v_mov_b32_e32 v19, s11
-; SDAG-NEXT: v_mov_b32_e32 v20, s12
-; SDAG-NEXT: v_mov_b32_e32 v21, s13
-; SDAG-NEXT: v_mov_b32_e32 v22, s14
-; SDAG-NEXT: v_mov_b32_e32 v23, s15
-; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
-; SDAG-NEXT: v_mov_b32_e32 v24, s16
-; SDAG-NEXT: v_mov_b32_e32 v25, s17
-; SDAG-NEXT: v_mov_b32_e32 v26, s18
-; SDAG-NEXT: v_mov_b32_e32 v27, s19
-; SDAG-NEXT: v_mov_b32_e32 v28, s20
-; SDAG-NEXT: v_mov_b32_e32 v29, s21
-; SDAG-NEXT: v_mov_b32_e32 v30, s22
-; SDAG-NEXT: v_mov_b32_e32 v31, s23
-; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
-; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
-; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
-; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
-; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
-; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
-; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v33, v32 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
-; SDAG-NEXT: s_nop 15
-; SDAG-NEXT: s_nop 2
-; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
-; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
-; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
-; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
-; SDAG-NEXT: s_endpgm
-;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
-; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40
-; GISEL-NEXT: v_mov_b32_e32 v32, 0x41
-; GISEL-NEXT: v_mov_b32_e32 v33, -2
-; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
-; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, v33 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
-; GISEL-NEXT: v_mov_b32_e32 v16, 0
-; GISEL-NEXT: s_nop 15
-; GISEL-NEXT: s_nop 2
-; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
-; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
-; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
-; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
-; GISEL-NEXT: s_endpgm
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 3, i32 65, i32 1, i32 -2)
- store <16 x float> %result, ptr addrspace(1) %ptr, align 64
- ret void
-}
-
-define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) #1 {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v2, s12
-; SDAG-NEXT: v_mov_b32_e32 v3, s13
-; SDAG-NEXT: v_mov_b32_e32 v4, s14
-; SDAG-NEXT: v_mov_b32_e32 v5, s15
-; SDAG-NEXT: v_mov_b32_e32 v6, s16
-; SDAG-NEXT: v_mov_b32_e32 v7, s17
-; SDAG-NEXT: v_mov_b32_e32 v8, s18
-; SDAG-NEXT: v_mov_b32_e32 v9, s19
-; SDAG-NEXT: v_mov_b32_e32 v10, s20
-; SDAG-NEXT: v_mov_b32_e32 v11, s21
-; SDAG-NEXT: v_mov_b32_e32 v12, s22
-; SDAG-NEXT: v_mov_b32_e32 v13, s23
-; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
-; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80
-; SDAG-NEXT: v_mov_b32_e32 v14, s24
-; SDAG-NEXT: v_mov_b32_e32 v15, s25
-; SDAG-NEXT: v_mov_b32_e32 v16, s26
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
-; SDAG-NEXT: v_mov_b32_e32 v17, s27
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
-; SDAG-NEXT: v_accvgpr_write_b32 a4, s12
-; SDAG-NEXT: v_accvgpr_write_b32 a5, s13
-; SDAG-NEXT: v_accvgpr_write_b32 a6, s14
-; SDAG-NEXT: v_accvgpr_write_b32 a7, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a8, s16
-; SDAG-NEXT: v_accvgpr_write_b32 a9, s17
-; SDAG-NEXT: v_accvgpr_write_b32 a10, s18
-; SDAG-NEXT: v_accvgpr_write_b32 a11, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a12, s20
-; SDAG-NEXT: v_accvgpr_write_b32 a13, s21
-; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
-; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
-; SDAG-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-NEXT: v_mov_b32_e32 v1, s1
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[2:9], v[10:17], a[0:15], v0, v1 op_sel_hi:[0,0,0]
-; SDAG-NEXT: v_mov_b32_e32 v2, s20
-; SDAG-NEXT: v_mov_b32_e32 v3, s21
-; SDAG-NEXT: v_mov_b32_e32 v4, s22
-; SDAG-NEXT: v_mov_b32_e32 v5, s23
-; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48
-; SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off sc0 sc1
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v6, s18
-; SDAG-NEXT: v_mov_b32_e32 v7, s19
-; SDAG-NEXT: v_mov_b32_e32 v4, s16
-; SDAG-NEXT: v_mov_b32_e32 v5, s17
-; SDAG-NEXT: v_mov_b64_e32 v[2:3], 32
-; SDAG-NEXT: global_store_dwordx4 v[2:3], v[4:7], off sc0 sc1
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v8, s14
-; SDAG-NEXT: v_mov_b32_e32 v9, s15
-; SDAG-NEXT: v_mov_b32_e32 v6, s12
-; SDAG-NEXT: v_mov_b32_e32 v7, s13
-; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16
-; SDAG-NEXT: global_store_dwordx4 v[4:5], v[6:9], off sc0 sc1
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v10, s10
-; SDAG-NEXT: v_mov_b32_e32 v11, s11
-; SDAG-NEXT: v_mov_b32_e32 v8, s8
-; SDAG-NEXT: v_mov_b32_e32 v9, s9
-; SDAG-NEXT: v_mov_b64_e32 v[6:7], 0
-; SDAG-NEXT: global_store_dwordx4 v[6:7], v[8:11], off sc0 sc1
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[2:3], a[8:11], off sc0 sc1
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[0:1], a[12:15], off sc0 sc1
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[6:7], a[0:3], off sc0 sc1
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[4:5], a[4:7], off sc0 sc1
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_endpgm
-;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0
-; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
-; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], 0
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], 16
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[36:37]
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[38:39]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[40:41]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[42:43]
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[44:45]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[46:47]
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[48:49]
-; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[50:51]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
-; GISEL-NEXT: v_accvgpr_write_b32 a4, s12
-; GISEL-NEXT: v_accvgpr_write_b32 a5, s13
-; GISEL-NEXT: v_accvgpr_write_b32 a6, s14
-; GISEL-NEXT: v_accvgpr_write_b32 a7, s15
-; GISEL-NEXT: v_accvgpr_write_b32 a8, s16
-; GISEL-NEXT: v_accvgpr_write_b32 a9, s17
-; GISEL-NEXT: v_accvgpr_write_b32 a10, s18
-; GISEL-NEXT: v_accvgpr_write_b32 a11, s19
-; GISEL-NEXT: v_accvgpr_write_b32 a12, s20
-; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
-; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
-; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
-; GISEL-NEXT: v_mov_b32_e32 v4, s0
-; GISEL-NEXT: v_mov_b32_e32 v5, s1
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[6:13], v[14:21], a[0:15], v4, v5 op_sel_hi:[0,0,0]
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], 32
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], 48
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[22:23]
-; GISEL-NEXT: global_store_dwordx4 v[0:1], v[8:11], off sc0 sc1
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[2:3], v[12:15], off sc0 sc1
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[4:5], v[16:19], off sc0 sc1
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[6:7], v[20:23], off sc0 sc1
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: global_store_dwordx4 v[0:1], a[0:3], off sc0 sc1
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[2:3], a[4:7], off sc0 sc1
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[4:5], a[8:11], off sc0 sc1
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[6:7], a[12:15], off sc0 sc1
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_endpgm
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
- store volatile <16 x float> %arg2, ptr addrspace(1) null, align 64
- store volatile <16 x float> %result, ptr addrspace(1) null, align 64
- ret void
-}
-
-define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) #1 {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0
-; SDAG-NEXT: v_mov_b32_e32 v0, 42
-; SDAG-NEXT: v_mov_b32_e32 v1, 25
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v2, s12
-; SDAG-NEXT: v_mov_b32_e32 v3, s13
-; SDAG-NEXT: v_mov_b32_e32 v4, s14
-; SDAG-NEXT: v_mov_b32_e32 v5, s15
-; SDAG-NEXT: v_mov_b32_e32 v6, s16
-; SDAG-NEXT: v_mov_b32_e32 v7, s17
-; SDAG-NEXT: v_mov_b32_e32 v8, s18
-; SDAG-NEXT: v_mov_b32_e32 v9, s19
-; SDAG-NEXT: v_mov_b32_e32 v10, s20
-; SDAG-NEXT: v_mov_b32_e32 v11, s21
-; SDAG-NEXT: v_mov_b32_e32 v12, s22
-; SDAG-NEXT: v_mov_b32_e32 v13, s23
-; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
-; SDAG-NEXT: v_mov_b32_e32 v14, s24
-; SDAG-NEXT: v_mov_b32_e32 v15, s25
-; SDAG-NEXT: v_mov_b32_e32 v16, s26
-; SDAG-NEXT: v_mov_b32_e32 v17, s27
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
-; SDAG-NEXT: v_accvgpr_write_b32 a4, s12
-; SDAG-NEXT: v_accvgpr_write_b32 a5, s13
-; SDAG-NEXT: v_accvgpr_write_b32 a6, s14
-; SDAG-NEXT: v_accvgpr_write_b32 a7, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a8, s16
-; SDAG-NEXT: v_accvgpr_write_b32 a9, s17
-; SDAG-NEXT: v_accvgpr_write_b32 a10, s18
-; SDAG-NEXT: v_accvgpr_write_b32 a11, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a12, s20
-; SDAG-NEXT: v_accvgpr_write_b32 a13, s21
-; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
-; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[2:9], v[10:17], a[0:15], v1, v0 op_sel_hi:[0,0,0] blgp:2
-; SDAG-NEXT: v_mov_b32_e32 v2, s20
-; SDAG-NEXT: v_mov_b32_e32 v3, s21
-; SDAG-NEXT: v_mov_b32_e32 v4, s22
-; SDAG-NEXT: v_mov_b32_e32 v5, s23
-; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48
-; SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off sc0 sc1
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v6, s18
-; SDAG-NEXT: v_mov_b32_e32 v7, s19
-; SDAG-NEXT: v_mov_b32_e32 v4, s16
-; SDAG-NEXT: v_mov_b32_e32 v5, s17
-; SDAG-NEXT: v_mov_b64_e32 v[2:3], 32
-; SDAG-NEXT: global_store_dwordx4 v[2:3], v[4:7], off sc0 sc1
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v8, s14
-; SDAG-NEXT: v_mov_b32_e32 v9, s15
-; SDAG-NEXT: v_mov_b32_e32 v6, s12
-; SDAG-NEXT: v_mov_b32_e32 v7, s13
-; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16
-; SDAG-NEXT: global_store_dwordx4 v[4:5], v[6:9], off sc0 sc1
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v10, s10
-; SDAG-NEXT: v_mov_b32_e32 v11, s11
-; SDAG-NEXT: v_mov_b32_e32 v8, s8
-; SDAG-NEXT: v_mov_b32_e32 v9, s9
-; SDAG-NEXT: v_mov_b64_e32 v[6:7], 0
-; SDAG-NEXT: global_store_dwordx4 v[6:7], v[8:11], off sc0 sc1
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[2:3], a[8:11], off sc0 sc1
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[0:1], a[12:15], off sc0 sc1
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[6:7], a[0:3], off sc0 sc1
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[4:5], a[4:7], off sc0 sc1
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_endpgm
-;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0
-; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
-; GISEL-NEXT: v_mov_b32_e32 v4, 25
-; GISEL-NEXT: v_mov_b32_e32 v5, 42
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], 0
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[36:37]
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[38:39]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[40:41]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[42:43]
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[44:45]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[46:47]
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[48:49]
-; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[50:51]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
-; GISEL-NEXT: v_accvgpr_write_b32 a4, s12
-; GISEL-NEXT: v_accvgpr_write_b32 a5, s13
-; GISEL-NEXT: v_accvgpr_write_b32 a6, s14
-; GISEL-NEXT: v_accvgpr_write_b32 a7, s15
-; GISEL-NEXT: v_accvgpr_write_b32 a8, s16
-; GISEL-NEXT: v_accvgpr_write_b32 a9, s17
-; GISEL-NEXT: v_accvgpr_write_b32 a10, s18
-; GISEL-NEXT: v_accvgpr_write_b32 a11, s19
-; GISEL-NEXT: v_accvgpr_write_b32 a12, s20
-; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
-; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
-; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], 16
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[6:13], v[14:21], a[0:15], v4, v5 op_sel_hi:[0,0,0] blgp:2
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], 32
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], 48
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[22:23]
-; GISEL-NEXT: global_store_dwordx4 v[0:1], v[8:11], off sc0 sc1
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[2:3], v[12:15], off sc0 sc1
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[4:5], v[16:19], off sc0 sc1
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[6:7], v[20:23], off sc0 sc1
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: global_store_dwordx4 v[0:1], a[0:3], off sc0 sc1
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[2:3], a[4:7], off sc0 sc1
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[4:5], a[8:11], off sc0 sc1
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[6:7], a[12:15], off sc0 sc1
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_endpgm
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 25, i32 0, i32 42)
- store volatile <16 x float> %arg2, ptr addrspace(1) null, align 64
- store volatile <16 x float> %result, ptr addrspace(1) null, align 64
- ret void
-}
-
-define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonmac(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) #0 {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonmac:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v32, s12
-; SDAG-NEXT: v_mov_b32_e32 v33, s13
-; SDAG-NEXT: v_mov_b32_e32 v34, s14
-; SDAG-NEXT: v_mov_b32_e32 v35, s15
-; SDAG-NEXT: v_mov_b32_e32 v36, s16
-; SDAG-NEXT: v_mov_b32_e32 v37, s17
-; SDAG-NEXT: v_mov_b32_e32 v38, s18
-; SDAG-NEXT: v_mov_b32_e32 v39, s19
-; SDAG-NEXT: v_mov_b32_e32 v40, s20
-; SDAG-NEXT: v_mov_b32_e32 v41, s21
-; SDAG-NEXT: v_mov_b32_e32 v42, s22
-; SDAG-NEXT: v_mov_b32_e32 v43, s23
-; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
-; SDAG-NEXT: v_mov_b32_e32 v44, s24
-; SDAG-NEXT: v_mov_b32_e32 v45, s25
-; SDAG-NEXT: v_mov_b32_e32 v46, s26
-; SDAG-NEXT: v_mov_b32_e32 v47, s27
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
-; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
-; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
-; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
-; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
-; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
-; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
-; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[32:39], v[40:47], v[16:31] blgp:2
-; SDAG-NEXT: s_nop 14
-; SDAG-NEXT: v_mov_b32_e32 v16, s20
-; SDAG-NEXT: v_mov_b32_e32 v17, s21
-; SDAG-NEXT: v_mov_b32_e32 v18, s22
-; SDAG-NEXT: v_mov_b32_e32 v19, s23
-; SDAG-NEXT: v_mov_b64_e32 v[20:21], 48
-; SDAG-NEXT: global_store_dwordx4 v[20:21], v[16:19], off sc0 sc1
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[22:23], 32
-; SDAG-NEXT: v_mov_b64_e32 v[24:25], 16
-; SDAG-NEXT: v_mov_b32_e32 v16, s16
-; SDAG-NEXT: v_mov_b32_e32 v17, s17
-; SDAG-NEXT: v_mov_b32_e32 v18, s18
-; SDAG-NEXT: v_mov_b32_e32 v19, s19
-; SDAG-NEXT: global_store_dwordx4 v[22:23], v[16:19], off sc0 sc1
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[26:27], 0
-; SDAG-NEXT: v_mov_b32_e32 v16, s12
-; SDAG-NEXT: v_mov_b32_e32 v17, s13
-; SDAG-NEXT: v_mov_b32_e32 v18, s14
-; SDAG-NEXT: v_mov_b32_e32 v19, s15
-; SDAG-NEXT: global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v16, s8
-; SDAG-NEXT: v_mov_b32_e32 v17, s9
-; SDAG-NEXT: v_mov_b32_e32 v18, s10
-; SDAG-NEXT: v_mov_b32_e32 v19, s11
-; SDAG-NEXT: global_store_dwordx4 v[26:27], v[16:19], off sc0 sc1
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[22:23], v[8:11], off sc0 sc1
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[20:21], v[12:15], off sc0 sc1
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[24:25], v[4:7], off sc0 sc1
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_endpgm
-;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonmac:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0
-; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[36:37]
-; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39]
-; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[40:41]
-; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[42:43]
-; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[44:45]
-; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
-; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[46:47]
-; GISEL-NEXT: v_mov_b64_e32 v[44:45], s[48:49]
-; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[50:51]
-; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[32:39], v[40:47], v[16:31] blgp:2
-; GISEL-NEXT: v_mov_b64_e32 v[32:33], 0
-; GISEL-NEXT: v_mov_b64_e32 v[34:35], 16
-; GISEL-NEXT: v_mov_b64_e32 v[36:37], 32
-; GISEL-NEXT: v_mov_b64_e32 v[38:39], 48
-; GISEL-NEXT: global_store_dwordx4 v[32:33], v[16:19], off sc0 sc1
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[34:35], v[20:23], off sc0 sc1
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[36:37], v[24:27], off sc0 sc1
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_nop 7
-; GISEL-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[36:37], v[8:11], off sc0 sc1
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[38:39], v[12:15], off sc0 sc1
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_endpgm
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 0, i32 0, i32 0)
- store volatile <16 x float> %arg2, ptr addrspace(1) null, align 64
- store volatile <16 x float> %result, ptr addrspace(1) null, align 64
- ret void
-}
-
-define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_nonmac(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) #0 {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_nonmac:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0
-; SDAG-NEXT: v_mov_b32_e32 v32, 42
-; SDAG-NEXT: v_mov_b32_e32 v33, 25
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v16, s12
-; SDAG-NEXT: v_mov_b32_e32 v17, s13
-; SDAG-NEXT: v_mov_b32_e32 v18, s14
-; SDAG-NEXT: v_mov_b32_e32 v19, s15
-; SDAG-NEXT: v_mov_b32_e32 v20, s16
-; SDAG-NEXT: v_mov_b32_e32 v21, s17
-; SDAG-NEXT: v_mov_b32_e32 v22, s18
-; SDAG-NEXT: v_mov_b32_e32 v23, s19
-; SDAG-NEXT: v_mov_b32_e32 v24, s20
-; SDAG-NEXT: v_mov_b32_e32 v25, s21
-; SDAG-NEXT: v_mov_b32_e32 v26, s22
-; SDAG-NEXT: v_mov_b32_e32 v27, s23
-; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
-; SDAG-NEXT: v_mov_b32_e32 v28, s24
-; SDAG-NEXT: v_mov_b32_e32 v29, s25
-; SDAG-NEXT: v_mov_b32_e32 v30, s26
-; SDAG-NEXT: v_mov_b32_e32 v31, s27
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
-; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
-; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
-; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
-; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
-; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v33, v32 op_sel_hi:[0,0,0] blgp:2
-; SDAG-NEXT: v_mov_b32_e32 v16, s20
-; SDAG-NEXT: v_mov_b32_e32 v17, s21
-; SDAG-NEXT: v_mov_b32_e32 v18, s22
-; SDAG-NEXT: v_mov_b32_e32 v19, s23
-; SDAG-NEXT: v_mov_b64_e32 v[20:21], 48
-; SDAG-NEXT: global_store_dwordx4 v[20:21], v[16:19], off sc0 sc1
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[22:23], 32
-; SDAG-NEXT: v_mov_b64_e32 v[24:25], 16
-; SDAG-NEXT: v_mov_b32_e32 v16, s16
-; SDAG-NEXT: v_mov_b32_e32 v17, s17
-; SDAG-NEXT: v_mov_b32_e32 v18, s18
-; SDAG-NEXT: v_mov_b32_e32 v19, s19
-; SDAG-NEXT: global_store_dwordx4 v[22:23], v[16:19], off sc0 sc1
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[26:27], 0
-; SDAG-NEXT: v_mov_b32_e32 v16, s12
-; SDAG-NEXT: v_mov_b32_e32 v17, s13
-; SDAG-NEXT: v_mov_b32_e32 v18, s14
-; SDAG-NEXT: v_mov_b32_e32 v19, s15
-; SDAG-NEXT: global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v16, s8
-; SDAG-NEXT: v_mov_b32_e32 v17, s9
-; SDAG-NEXT: v_mov_b32_e32 v18, s10
-; SDAG-NEXT: v_mov_b32_e32 v19, s11
-; SDAG-NEXT: global_store_dwordx4 v[26:27], v[16:19], off sc0 sc1
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[22:23], v[8:11], off sc0 sc1
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[20:21], v[12:15], off sc0 sc1
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[24:25], v[4:7], off sc0 sc1
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_endpgm
-;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_nonmac:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0
-; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
-; GISEL-NEXT: v_mov_b32_e32 v32, 25
-; GISEL-NEXT: v_mov_b32_e32 v33, 42
-; GISEL-NEXT: v_mov_b64_e32 v[34:35], 16
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[36:37]
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[38:39]
-; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[40:41]
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[42:43]
-; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[44:45]
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[46:47]
-; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[48:49]
-; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[50:51]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; GISEL-NEXT: v_mov_b64_e32 v[36:37], 32
-; GISEL-NEXT: v_mov_b64_e32 v[38:39], 48
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, v33 op_sel_hi:[0,0,0] blgp:2
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
-; GISEL-NEXT: v_mov_b64_e32 v[32:33], 0
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
-; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
-; GISEL-NEXT: global_store_dwordx4 v[32:33], v[16:19], off sc0 sc1
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[34:35], v[20:23], off sc0 sc1
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[36:37], v[24:27], off sc0 sc1
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_nop 2
-; GISEL-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[36:37], v[8:11], off sc0 sc1
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[38:39], v[12:15], off sc0 sc1
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_endpgm
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 25, i32 0, i32 42)
- store volatile <16 x float> %arg2, ptr addrspace(1) null, align 64
- store volatile <16 x float> %result, ptr addrspace(1) null, align 64
- ret void
-}
-
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_a(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_a:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15]
-; SDAG-NEXT: s_nop 15
-; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_a:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15]
-; GISEL-NEXT: s_nop 15
-; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
-; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- ret <16 x float> %result
-}
-
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_b(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_b:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15]
-; SDAG-NEXT: s_nop 15
-; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_b:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v20, v21 op_sel_hi:[0,0,0]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], 32
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15]
-; GISEL-NEXT: s_nop 15
-; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
-; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 3, i32 0, i32 1, i32 0)
- ret <16 x float> %result
-}
-
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_1(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_1:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: v_mov_b32_e32 v16, 1
-; SDAG-NEXT: v_mov_b32_e32 v17, 0
-; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[0,0,0]
-; SDAG-NEXT: s_nop 15
-; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_1:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
-; GISEL-NEXT: v_mov_b32_e32 v16, 0
-; GISEL-NEXT: v_mov_b32_e32 v17, 1
+; GISEL-NEXT: global_store_dwordx4 v[18:19], v[4:7], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0]
-; GISEL-NEXT: s_nop 15
-; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
-; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1)
- ret <16 x float> %result
+; GISEL-NEXT: global_store_dwordx4 v[20:21], v[8:11], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_store_dwordx4 v[22:23], v[12:15], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 2
+; GISEL-NEXT: global_store_dwordx4 v[16:17], a[0:3], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_store_dwordx4 v[18:19], a[4:7], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_store_dwordx4 v[20:21], a[8:11], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_store_dwordx4 v[22:23], a[12:15], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_endpgm
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
+ store volatile <16 x float> %arg2, ptr addrspace(1) null, align 64
+ store volatile <16 x float> %result, ptr addrspace(1) null, align 64
+ ret void
}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_1_0_a(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_1_0_a:
+define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) #1 {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac:
; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
-; SDAG-NEXT: v_mov_b32_e32 v17, 1
+; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0
+; SDAG-NEXT: v_mov_b32_e32 v0, 42
+; SDAG-NEXT: v_mov_b32_e32 v1, 25
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: v_mov_b32_e32 v2, s12
+; SDAG-NEXT: v_mov_b32_e32 v3, s13
+; SDAG-NEXT: v_mov_b32_e32 v4, s14
+; SDAG-NEXT: v_mov_b32_e32 v5, s15
+; SDAG-NEXT: v_mov_b32_e32 v6, s16
+; SDAG-NEXT: v_mov_b32_e32 v7, s17
+; SDAG-NEXT: v_mov_b32_e32 v8, s18
+; SDAG-NEXT: v_mov_b32_e32 v9, s19
+; SDAG-NEXT: v_mov_b32_e32 v10, s20
+; SDAG-NEXT: v_mov_b32_e32 v11, s21
+; SDAG-NEXT: v_mov_b32_e32 v12, s22
+; SDAG-NEXT: v_mov_b32_e32 v13, s23
+; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
+; SDAG-NEXT: v_mov_b32_e32 v14, s24
+; SDAG-NEXT: v_mov_b32_e32 v15, s25
+; SDAG-NEXT: v_mov_b32_e32 v16, s26
+; SDAG-NEXT: v_mov_b32_e32 v17, s27
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
+; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
+; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
+; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
+; SDAG-NEXT: v_accvgpr_write_b32 a4, s12
+; SDAG-NEXT: v_accvgpr_write_b32 a5, s13
+; SDAG-NEXT: v_accvgpr_write_b32 a6, s14
+; SDAG-NEXT: v_accvgpr_write_b32 a7, s15
+; SDAG-NEXT: v_accvgpr_write_b32 a8, s16
+; SDAG-NEXT: v_accvgpr_write_b32 a9, s17
+; SDAG-NEXT: v_accvgpr_write_b32 a10, s18
+; SDAG-NEXT: v_accvgpr_write_b32 a11, s19
+; SDAG-NEXT: v_accvgpr_write_b32 a12, s20
+; SDAG-NEXT: v_accvgpr_write_b32 a13, s21
+; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
+; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[2:9], v[10:17], a[0:15], v1, v0 op_sel_hi:[0,0,0] blgp:2
+; SDAG-NEXT: v_mov_b32_e32 v2, s20
+; SDAG-NEXT: v_mov_b32_e32 v3, s21
+; SDAG-NEXT: v_mov_b32_e32 v4, s22
+; SDAG-NEXT: v_mov_b32_e32 v5, s23
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48
+; SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: v_mov_b32_e32 v6, s18
+; SDAG-NEXT: v_mov_b32_e32 v7, s19
+; SDAG-NEXT: v_mov_b32_e32 v4, s16
+; SDAG-NEXT: v_mov_b32_e32 v5, s17
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], 32
+; SDAG-NEXT: global_store_dwordx4 v[2:3], v[4:7], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: v_mov_b32_e32 v8, s14
+; SDAG-NEXT: v_mov_b32_e32 v9, s15
+; SDAG-NEXT: v_mov_b32_e32 v6, s12
+; SDAG-NEXT: v_mov_b32_e32 v7, s13
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16
+; SDAG-NEXT: global_store_dwordx4 v[4:5], v[6:9], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: v_mov_b32_e32 v10, s10
+; SDAG-NEXT: v_mov_b32_e32 v11, s11
+; SDAG-NEXT: v_mov_b32_e32 v8, s8
+; SDAG-NEXT: v_mov_b32_e32 v9, s9
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], 0
+; SDAG-NEXT: global_store_dwordx4 v[6:7], v[8:11], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[0,0,0]
-; SDAG-NEXT: s_nop 15
-; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT: s_setpc_b64 s[30:31]
+; SDAG-NEXT: global_store_dwordx4 v[2:3], a[8:11], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: global_store_dwordx4 v[0:1], a[12:15], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: global_store_dwordx4 v[6:7], a[0:3], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: global_store_dwordx4 v[4:5], a[4:7], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_endpgm
;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_1_0_a:
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac:
; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
-; GISEL-NEXT: v_mov_b32_e32 v16, 1
-; GISEL-NEXT: v_mov_b32_e32 v17, 0
+; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0
+; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
+; GISEL-NEXT: v_mov_b32_e32 v20, 25
+; GISEL-NEXT: v_mov_b32_e32 v21, 42
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], 0
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
+; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
+; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
+; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
+; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
+; GISEL-NEXT: v_accvgpr_write_b32 a4, s12
+; GISEL-NEXT: v_accvgpr_write_b32 a5, s13
+; GISEL-NEXT: v_accvgpr_write_b32 a6, s14
+; GISEL-NEXT: v_accvgpr_write_b32 a7, s15
+; GISEL-NEXT: v_accvgpr_write_b32 a8, s16
+; GISEL-NEXT: v_accvgpr_write_b32 a9, s17
+; GISEL-NEXT: v_accvgpr_write_b32 a10, s18
+; GISEL-NEXT: v_accvgpr_write_b32 a11, s19
+; GISEL-NEXT: v_accvgpr_write_b32 a12, s20
+; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
+; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
+; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], 16
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], 48
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v20, v21 op_sel_hi:[0,0,0] blgp:2
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], 32
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0]
-; GISEL-NEXT: s_nop 15
-; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
-; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0)
- ret <16 x float> %result
+; GISEL-NEXT: global_store_dwordx4 v[18:19], v[4:7], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_store_dwordx4 v[20:21], v[8:11], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_store_dwordx4 v[22:23], v[12:15], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 2
+; GISEL-NEXT: global_store_dwordx4 v[16:17], a[0:3], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_store_dwordx4 v[18:19], a[4:7], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_store_dwordx4 v[20:21], a[8:11], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_store_dwordx4 v[22:23], a[12:15], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_endpgm
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 25, i32 0, i32 42)
+ store volatile <16 x float> %arg2, ptr addrspace(1) null, align 64
+ store volatile <16 x float> %result, ptr addrspace(1) null, align 64
+ ret void
}
-; --------------------------------------------------------------------
-; Incorrect signature for format cases (IR vector too large)
-; --------------------------------------------------------------------
-
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp6(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp6:
+define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonmac(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) #0 {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonmac:
; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:8
-; SDAG-NEXT: scratch_load_dword v17, off, s32 offset:4
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: v_mov_b32_e32 v32, s12
+; SDAG-NEXT: v_mov_b32_e32 v33, s13
+; SDAG-NEXT: v_mov_b32_e32 v34, s14
+; SDAG-NEXT: v_mov_b32_e32 v35, s15
+; SDAG-NEXT: v_mov_b32_e32 v36, s16
+; SDAG-NEXT: v_mov_b32_e32 v37, s17
+; SDAG-NEXT: v_mov_b32_e32 v38, s18
+; SDAG-NEXT: v_mov_b32_e32 v39, s19
+; SDAG-NEXT: v_mov_b32_e32 v40, s20
+; SDAG-NEXT: v_mov_b32_e32 v41, s21
+; SDAG-NEXT: v_mov_b32_e32 v42, s22
+; SDAG-NEXT: v_mov_b32_e32 v43, s23
+; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
+; SDAG-NEXT: v_mov_b32_e32 v44, s24
+; SDAG-NEXT: v_mov_b32_e32 v45, s25
+; SDAG-NEXT: v_mov_b32_e32 v46, s26
+; SDAG-NEXT: v_mov_b32_e32 v47, s27
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[32:39], v[40:47], v[16:31] blgp:2
+; SDAG-NEXT: s_nop 14
+; SDAG-NEXT: v_mov_b32_e32 v16, s20
+; SDAG-NEXT: v_mov_b32_e32 v17, s21
+; SDAG-NEXT: v_mov_b32_e32 v18, s22
+; SDAG-NEXT: v_mov_b32_e32 v19, s23
+; SDAG-NEXT: v_mov_b64_e32 v[20:21], 48
+; SDAG-NEXT: global_store_dwordx4 v[20:21], v[16:19], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: v_mov_b64_e32 v[22:23], 32
+; SDAG-NEXT: v_mov_b64_e32 v[24:25], 16
+; SDAG-NEXT: v_mov_b32_e32 v16, s16
+; SDAG-NEXT: v_mov_b32_e32 v17, s17
+; SDAG-NEXT: v_mov_b32_e32 v18, s18
+; SDAG-NEXT: v_mov_b32_e32 v19, s19
+; SDAG-NEXT: global_store_dwordx4 v[22:23], v[16:19], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: v_mov_b64_e32 v[26:27], 0
+; SDAG-NEXT: v_mov_b32_e32 v16, s12
+; SDAG-NEXT: v_mov_b32_e32 v17, s13
+; SDAG-NEXT: v_mov_b32_e32 v18, s14
+; SDAG-NEXT: v_mov_b32_e32 v19, s15
+; SDAG-NEXT: global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[0,0,0] blgp:2
-; SDAG-NEXT: s_nop 15
-; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT: s_setpc_b64 s[30:31]
+; SDAG-NEXT: v_mov_b32_e32 v16, s8
+; SDAG-NEXT: v_mov_b32_e32 v17, s9
+; SDAG-NEXT: v_mov_b32_e32 v18, s10
+; SDAG-NEXT: v_mov_b32_e32 v19, s11
+; SDAG-NEXT: global_store_dwordx4 v[26:27], v[16:19], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: global_store_dwordx4 v[22:23], v[8:11], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: global_store_dwordx4 v[20:21], v[12:15], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: global_store_dwordx4 v[24:25], v[4:7], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_endpgm
;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp6:
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonmac:
; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4
-; GISEL-NEXT: scratch_load_dword v17, off, s32 offset:8
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0
+; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[36:37]
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39]
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[40:41]
+; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[42:43]
+; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[44:45]
+; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[46:47]
+; GISEL-NEXT: v_mov_b64_e32 v[44:45], s[48:49]
+; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[50:51]
+; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[32:39], v[40:47], v[16:31] blgp:2
+; GISEL-NEXT: v_mov_b64_e32 v[32:33], 0
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], 16
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], 32
+; GISEL-NEXT: v_mov_b64_e32 v[38:39], 48
+; GISEL-NEXT: global_store_dwordx4 v[32:33], v[16:19], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] blgp:2
-; GISEL-NEXT: s_nop 15
-; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
-; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
- i32 0, ; cbsz
- i32 2, ; blgp
- i32 0, i32 %scale0, i32 0, i32 %scale1)
- ret <16 x float> %result
+; GISEL-NEXT: global_store_dwordx4 v[34:35], v[20:23], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_store_dwordx4 v[36:37], v[24:27], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_store_dwordx4 v[36:37], v[8:11], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_store_dwordx4 v[38:39], v[12:15], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_endpgm
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 0, i32 0, i32 0)
+ store volatile <16 x float> %arg2, ptr addrspace(1) null, align 64
+ store volatile <16 x float> %result, ptr addrspace(1) null, align 64
+ ret void
}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp8(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp8:
+define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_nonmac(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) #0 {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_nonmac:
; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:8
-; SDAG-NEXT: scratch_load_dword v17, off, s32 offset:4
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0
+; SDAG-NEXT: v_mov_b32_e32 v32, 42
+; SDAG-NEXT: v_mov_b32_e32 v33, 25
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: v_mov_b32_e32 v16, s12
+; SDAG-NEXT: v_mov_b32_e32 v17, s13
+; SDAG-NEXT: v_mov_b32_e32 v18, s14
+; SDAG-NEXT: v_mov_b32_e32 v19, s15
+; SDAG-NEXT: v_mov_b32_e32 v20, s16
+; SDAG-NEXT: v_mov_b32_e32 v21, s17
+; SDAG-NEXT: v_mov_b32_e32 v22, s18
+; SDAG-NEXT: v_mov_b32_e32 v23, s19
+; SDAG-NEXT: v_mov_b32_e32 v24, s20
+; SDAG-NEXT: v_mov_b32_e32 v25, s21
+; SDAG-NEXT: v_mov_b32_e32 v26, s22
+; SDAG-NEXT: v_mov_b32_e32 v27, s23
+; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
+; SDAG-NEXT: v_mov_b32_e32 v28, s24
+; SDAG-NEXT: v_mov_b32_e32 v29, s25
+; SDAG-NEXT: v_mov_b32_e32 v30, s26
+; SDAG-NEXT: v_mov_b32_e32 v31, s27
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v33, v32 op_sel_hi:[0,0,0] blgp:2
+; SDAG-NEXT: v_mov_b32_e32 v16, s20
+; SDAG-NEXT: v_mov_b32_e32 v17, s21
+; SDAG-NEXT: v_mov_b32_e32 v18, s22
+; SDAG-NEXT: v_mov_b32_e32 v19, s23
+; SDAG-NEXT: v_mov_b64_e32 v[20:21], 48
+; SDAG-NEXT: global_store_dwordx4 v[20:21], v[16:19], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: v_mov_b64_e32 v[22:23], 32
+; SDAG-NEXT: v_mov_b64_e32 v[24:25], 16
+; SDAG-NEXT: v_mov_b32_e32 v16, s16
+; SDAG-NEXT: v_mov_b32_e32 v17, s17
+; SDAG-NEXT: v_mov_b32_e32 v18, s18
+; SDAG-NEXT: v_mov_b32_e32 v19, s19
+; SDAG-NEXT: global_store_dwordx4 v[22:23], v[16:19], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: v_mov_b64_e32 v[26:27], 0
+; SDAG-NEXT: v_mov_b32_e32 v16, s12
+; SDAG-NEXT: v_mov_b32_e32 v17, s13
+; SDAG-NEXT: v_mov_b32_e32 v18, s14
+; SDAG-NEXT: v_mov_b32_e32 v19, s15
+; SDAG-NEXT: global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[0,0,0] cbsz:2
-; SDAG-NEXT: s_nop 15
-; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT: s_setpc_b64 s[30:31]
+; SDAG-NEXT: v_mov_b32_e32 v16, s8
+; SDAG-NEXT: v_mov_b32_e32 v17, s9
+; SDAG-NEXT: v_mov_b32_e32 v18, s10
+; SDAG-NEXT: v_mov_b32_e32 v19, s11
+; SDAG-NEXT: global_store_dwordx4 v[26:27], v[16:19], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: global_store_dwordx4 v[22:23], v[8:11], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: global_store_dwordx4 v[20:21], v[12:15], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: global_store_dwordx4 v[24:25], v[4:7], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_endpgm
;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp8:
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_nonmac:
; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4
-; GISEL-NEXT: scratch_load_dword v17, off, s32 offset:8
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0
+; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
+; GISEL-NEXT: v_mov_b32_e32 v32, 25
+; GISEL-NEXT: v_mov_b32_e32 v33, 42
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], 16
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[36:37]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[38:39]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[40:41]
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[42:43]
+; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[44:45]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[46:47]
+; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[48:49]
+; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[50:51]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], 32
+; GISEL-NEXT: v_mov_b64_e32 v[38:39], 48
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, v33 op_sel_hi:[0,0,0] blgp:2
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[32:33], 0
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; GISEL-NEXT: global_store_dwordx4 v[32:33], v[16:19], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] cbsz:2
-; GISEL-NEXT: s_nop 15
-; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
-; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
- i32 2, ; cbsz
- i32 0, ; blgp
- i32 0, i32 %scale0, i32 0, i32 %scale1)
+; GISEL-NEXT: global_store_dwordx4 v[34:35], v[20:23], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_store_dwordx4 v[36:37], v[24:27], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 2
+; GISEL-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_store_dwordx4 v[36:37], v[8:11], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_store_dwordx4 v[38:39], v[12:15], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_endpgm
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 25, i32 0, i32 42)
+ store volatile <16 x float> %arg2, ptr addrspace(1) null, align 64
+ store volatile <16 x float> %result, ptr addrspace(1) null, align 64
+ ret void
+}
+
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_a(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_a:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: scratch_load_dword a15, off, s32
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: v_accvgpr_write_b32 a4, v20
+; GCN-NEXT: v_accvgpr_write_b32 a5, v21
+; GCN-NEXT: v_accvgpr_write_b32 a6, v22
+; GCN-NEXT: v_accvgpr_write_b32 a7, v23
+; GCN-NEXT: v_accvgpr_write_b32 a8, v24
+; GCN-NEXT: v_accvgpr_write_b32 a9, v25
+; GCN-NEXT: v_accvgpr_write_b32 a10, v26
+; GCN-NEXT: v_accvgpr_write_b32 a11, v27
+; GCN-NEXT: v_accvgpr_write_b32 a12, v28
+; GCN-NEXT: v_accvgpr_write_b32 a13, v29
+; GCN-NEXT: v_accvgpr_write_b32 a14, v30
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15]
+; GCN-NEXT: s_nop 15
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
ret <16 x float> %result
}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6:
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_b(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_b:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: scratch_load_dword a15, off, s32
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: v_accvgpr_write_b32 a4, v20
+; GCN-NEXT: v_accvgpr_write_b32 a5, v21
+; GCN-NEXT: v_accvgpr_write_b32 a6, v22
+; GCN-NEXT: v_accvgpr_write_b32 a7, v23
+; GCN-NEXT: v_accvgpr_write_b32 a8, v24
+; GCN-NEXT: v_accvgpr_write_b32 a9, v25
+; GCN-NEXT: v_accvgpr_write_b32 a10, v26
+; GCN-NEXT: v_accvgpr_write_b32 a11, v27
+; GCN-NEXT: v_accvgpr_write_b32 a12, v28
+; GCN-NEXT: v_accvgpr_write_b32 a13, v29
+; GCN-NEXT: v_accvgpr_write_b32 a14, v30
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15]
+; GCN-NEXT: s_nop 15
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 3, i32 0, i32 1, i32 0)
+ ret <16 x float> %result
+}
+
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_1(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_1:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:8
-; SDAG-NEXT: scratch_load_dword v17, off, s32 offset:4
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_mov_b32_e32 v31, 1
+; SDAG-NEXT: v_mov_b32_e32 v32, 0
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[0,0,0] cbsz:2 blgp:2
-; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0]
+; SDAG-NEXT: s_nop 15
+; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
@@ -7913,14 +5455,14 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6(
; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6:
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_1:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: v_mov_b32_e32 v31, 0
+; GISEL-NEXT: v_mov_b32_e32 v32, 1
; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4
-; GISEL-NEXT: scratch_load_dword v17, off, s32 offset:8
; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
@@ -7936,8 +5478,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6(
; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] cbsz:2 blgp:2
-; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0]
+; GISEL-NEXT: s_nop 15
+; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
@@ -7955,37 +5498,37 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6(
; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
- i32 2, ; cbsz
- i32 2, ; blgp
- i32 0, i32 %scale0, i32 0, i32 %scale1)
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1)
ret <16 x float> %result
}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6__0_scale(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6__0_scale:
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_1_0_a(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_1_0_a:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_mov_b32_e32 v31, 0
+; SDAG-NEXT: v_mov_b32_e32 v32, 1
; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:2 blgp:2
-; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0]
+; SDAG-NEXT: s_nop 15
+; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
@@ -8004,10 +5547,12 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6_
; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6__0_scale:
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_1_0_a:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: v_mov_b32_e32 v31, 1
+; GISEL-NEXT: v_mov_b32_e32 v32, 0
; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
@@ -8025,8 +5570,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6_
; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:2 blgp:2
-; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0]
+; GISEL-NEXT: s_nop 15
+; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
@@ -8044,38 +5590,39 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6_
; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
- i32 2, ; cbsz
- i32 2, ; blgp
- i32 0, i32 0, i32 0, i32 0)
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0)
ret <16 x float> %result
}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4:
+; --------------------------------------------------------------------
+; Incorrect signature for format cases (IR vector too large)
+; --------------------------------------------------------------------
+
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp6(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp6:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:8
-; SDAG-NEXT: scratch_load_dword v17, off, s32 offset:4
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[0,0,0] blgp:4
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] blgp:2
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
@@ -8096,14 +5643,14 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4(
; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4:
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp6:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4
+; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8
; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4
-; GISEL-NEXT: scratch_load_dword v17, off, s32 offset:8
; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
@@ -8119,7 +5666,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4(
; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] blgp:4
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] blgp:2
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
@@ -8141,36 +5688,36 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4(
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 0, ; cbsz
- i32 4, ; blgp
+ i32 2, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8:
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp8(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp8:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:8
-; SDAG-NEXT: scratch_load_dword v17, off, s32 offset:4
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[0,0,0] cbsz:4
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:2
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
@@ -8191,14 +5738,14 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8(
; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8:
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp8:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4
+; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8
; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4
-; GISEL-NEXT: scratch_load_dword v17, off, s32 offset:8
; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
@@ -8214,7 +5761,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8(
; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] cbsz:4
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:2
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
@@ -8235,38 +5782,38 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8(
; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
- i32 4, ; cbsz
+ i32 2, ; cbsz
i32 0, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v6i32_fp4(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v6i32_fp4:
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v14
-; SDAG-NEXT: scratch_load_dword v14, off, s32
-; SDAG-NEXT: v_accvgpr_write_b32 a15, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v15
+; SDAG-NEXT: scratch_load_dword a15, off, s32
+; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] blgp:4
-; SDAG-NEXT: s_nop 15
-; SDAG-NEXT: s_nop 3
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:2 blgp:2
+; SDAG-NEXT: s_nop 11
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
@@ -8285,31 +5832,31 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v6i32_fp4(
; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v6i32_fp4:
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v14
-; GISEL-NEXT: scratch_load_dword v14, off, s32
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a15, v29
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4
+; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] blgp:4
-; GISEL-NEXT: s_nop 15
-; GISEL-NEXT: s_nop 3
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:2 blgp:2
+; GISEL-NEXT: s_nop 11
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
@@ -8327,37 +5874,86 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v6i32_fp4(
; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
- i32 0, ; cbsz
- i32 4, ; blgp
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 2, ; cbsz
+ i32 2, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v6i32_fp4__v8i32_fp8(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v6i32_fp4__v8i32_fp8:
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6__0_scale(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6__0_scale:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: scratch_load_dword a15, off, s32
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: v_accvgpr_write_b32 a4, v20
+; GCN-NEXT: v_accvgpr_write_b32 a5, v21
+; GCN-NEXT: v_accvgpr_write_b32 a6, v22
+; GCN-NEXT: v_accvgpr_write_b32 a7, v23
+; GCN-NEXT: v_accvgpr_write_b32 a8, v24
+; GCN-NEXT: v_accvgpr_write_b32 a9, v25
+; GCN-NEXT: v_accvgpr_write_b32 a10, v26
+; GCN-NEXT: v_accvgpr_write_b32 a11, v27
+; GCN-NEXT: v_accvgpr_write_b32 a12, v28
+; GCN-NEXT: v_accvgpr_write_b32 a13, v29
+; GCN-NEXT: v_accvgpr_write_b32 a14, v30
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:2 blgp:2
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 2, ; cbsz
+ i32 2, ; blgp
+ i32 0, i32 0, i32 0, i32 0)
+ ret <16 x float> %result
+}
+
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v14
-; SDAG-NEXT: scratch_load_dword v14, off, s32
-; SDAG-NEXT: v_accvgpr_write_b32 a15, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v15
+; SDAG-NEXT: scratch_load_dword a15, off, s32
+; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:4
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] blgp:4
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
@@ -8378,29 +5974,30 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v6i32_fp4__v8i32_fp8(
; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v6i32_fp4__v8i32_fp8:
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v14
-; GISEL-NEXT: scratch_load_dword v14, off, s32
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a15, v29
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4
+; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:4
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] blgp:4
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
@@ -8420,39 +6017,40 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v6i32_fp4__v8i32_fp8(
; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
- i32 4, ; cbsz
- i32 0, ; blgp
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 0, ; cbsz
+ i32 4, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4:
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: scratch_load_dword v16, off, s32 offset:8
-; SDAG-NEXT: scratch_load_dword v17, off, s32 offset:4
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v17, v16 op_sel_hi:[0,0,0] cbsz:4 blgp:4
-; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:4
+; SDAG-NEXT: s_nop 15
+; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
@@ -8471,14 +6069,14 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4(
; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4:
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4
+; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8
; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: scratch_load_dword v16, off, s32 offset:4
-; GISEL-NEXT: scratch_load_dword v17, off, s32 offset:8
; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
@@ -8494,8 +6092,9 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4(
; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] cbsz:4 blgp:4
-; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:4
+; GISEL-NEXT: s_nop 15
+; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
@@ -8515,34 +6114,136 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4(
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 4, ; cbsz
+ i32 0, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <16 x float> %result
+}
+
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v6i32_fp4(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v6i32_fp4:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: scratch_load_dword v31, off, s32
+; GCN-NEXT: v_accvgpr_write_b32 a0, v14
+; GCN-NEXT: v_accvgpr_write_b32 a1, v15
+; GCN-NEXT: v_accvgpr_write_b32 a2, v16
+; GCN-NEXT: v_accvgpr_write_b32 a3, v17
+; GCN-NEXT: v_accvgpr_write_b32 a4, v18
+; GCN-NEXT: v_accvgpr_write_b32 a5, v19
+; GCN-NEXT: v_accvgpr_write_b32 a6, v20
+; GCN-NEXT: v_accvgpr_write_b32 a7, v21
+; GCN-NEXT: v_accvgpr_write_b32 a8, v22
+; GCN-NEXT: v_accvgpr_write_b32 a9, v23
+; GCN-NEXT: v_accvgpr_write_b32 a10, v24
+; GCN-NEXT: v_accvgpr_write_b32 a11, v25
+; GCN-NEXT: v_accvgpr_write_b32 a12, v26
+; GCN-NEXT: v_accvgpr_write_b32 a13, v27
+; GCN-NEXT: v_accvgpr_write_b32 a14, v28
+; GCN-NEXT: v_accvgpr_write_b32 a15, v29
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] blgp:4
+; GCN-NEXT: s_nop 15
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
+ i32 0, ; cbsz
i32 4, ; blgp
i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4__0_scale(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4__0_scale:
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v6i32_fp4__v8i32_fp8(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v6i32_fp4__v8i32_fp8:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: scratch_load_dword v31, off, s32
+; GCN-NEXT: v_accvgpr_write_b32 a0, v14
+; GCN-NEXT: v_accvgpr_write_b32 a1, v15
+; GCN-NEXT: v_accvgpr_write_b32 a2, v16
+; GCN-NEXT: v_accvgpr_write_b32 a3, v17
+; GCN-NEXT: v_accvgpr_write_b32 a4, v18
+; GCN-NEXT: v_accvgpr_write_b32 a5, v19
+; GCN-NEXT: v_accvgpr_write_b32 a6, v20
+; GCN-NEXT: v_accvgpr_write_b32 a7, v21
+; GCN-NEXT: v_accvgpr_write_b32 a8, v22
+; GCN-NEXT: v_accvgpr_write_b32 a9, v23
+; GCN-NEXT: v_accvgpr_write_b32 a10, v24
+; GCN-NEXT: v_accvgpr_write_b32 a11, v25
+; GCN-NEXT: v_accvgpr_write_b32 a12, v26
+; GCN-NEXT: v_accvgpr_write_b32 a13, v27
+; GCN-NEXT: v_accvgpr_write_b32 a14, v28
+; GCN-NEXT: v_accvgpr_write_b32 a15, v29
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:4
+; GCN-NEXT: s_nop 15
+; GCN-NEXT: s_nop 3
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 4, ; cbsz
+ i32 0, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <16 x float> %result
+}
+
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:4 blgp:4
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:4 blgp:4
; SDAG-NEXT: s_nop 11
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
@@ -8562,10 +6263,12 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4_
; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4__0_scale:
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: scratch_load_dword a15, off, s32
+; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4
+; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8
; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
@@ -8583,7 +6286,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4_
; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:4 blgp:4
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:4 blgp:4
; GISEL-NEXT: s_nop 11
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
@@ -8602,6 +6305,54 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4_
; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+ i32 4, ; cbsz
+ i32 4, ; blgp
+ i32 0, i32 %scale0, i32 0, i32 %scale1)
+ ret <16 x float> %result
+}
+
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4__0_scale(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4__0_scale:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: scratch_load_dword a15, off, s32
+; GCN-NEXT: v_accvgpr_write_b32 a0, v16
+; GCN-NEXT: v_accvgpr_write_b32 a1, v17
+; GCN-NEXT: v_accvgpr_write_b32 a2, v18
+; GCN-NEXT: v_accvgpr_write_b32 a3, v19
+; GCN-NEXT: v_accvgpr_write_b32 a4, v20
+; GCN-NEXT: v_accvgpr_write_b32 a5, v21
+; GCN-NEXT: v_accvgpr_write_b32 a6, v22
+; GCN-NEXT: v_accvgpr_write_b32 a7, v23
+; GCN-NEXT: v_accvgpr_write_b32 a8, v24
+; GCN-NEXT: v_accvgpr_write_b32 a9, v25
+; GCN-NEXT: v_accvgpr_write_b32 a10, v26
+; GCN-NEXT: v_accvgpr_write_b32 a11, v27
+; GCN-NEXT: v_accvgpr_write_b32 a12, v28
+; GCN-NEXT: v_accvgpr_write_b32 a13, v29
+; GCN-NEXT: v_accvgpr_write_b32 a14, v30
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:4 blgp:4
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 4, ; cbsz
i32 4, ; blgp
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
index c2b7e51c43bc8..6eb9449069a52 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
@@ -149,19 +149,19 @@ define <4 x float> @test_smfmac_f32_16x16x64_f16__sgpr(<8 x half> inreg %arg0, <
; GISEL-LABEL: test_smfmac_f32_16x16x64_f16__sgpr:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[2:3]
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[0:1]
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[22:23]
; GISEL-NEXT: v_mov_b32_e32 v0, s24
; GISEL-NEXT: v_mov_b32_e32 v1, s25
; GISEL-NEXT: v_mov_b32_e32 v2, s26
; GISEL-NEXT: v_mov_b32_e32 v3, s27
-; GISEL-NEXT: v_mov_b32_e32 v4, s28
+; GISEL-NEXT: v_mov_b32_e32 v16, s28
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_smfmac_f32_16x16x64_f16 v[0:3], v[14:17], v[6:13], v4
+; GISEL-NEXT: v_smfmac_f32_16x16x64_f16 v[0:3], v[12:15], v[4:11], v16
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <4 x float> %result
@@ -247,151 +247,168 @@ bb:
}
define <16 x float> @test_smfmac_f32_32x32x32_f16(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3) {
-; GCN-LABEL: test_smfmac_f32_32x32x32_f16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_mov_b32_e32 v0, v12
-; GCN-NEXT: v_mov_b32_e32 v1, v13
-; GCN-NEXT: v_mov_b32_e32 v2, v14
-; GCN-NEXT: v_mov_b32_e32 v3, v15
-; GCN-NEXT: v_mov_b32_e32 v4, v16
-; GCN-NEXT: v_mov_b32_e32 v5, v17
-; GCN-NEXT: v_mov_b32_e32 v6, v18
-; GCN-NEXT: v_mov_b32_e32 v7, v19
-; GCN-NEXT: v_mov_b32_e32 v8, v20
-; GCN-NEXT: v_mov_b32_e32 v9, v21
-; GCN-NEXT: v_mov_b32_e32 v10, v22
-; GCN-NEXT: v_mov_b32_e32 v11, v23
-; GCN-NEXT: v_mov_b32_e32 v12, v24
-; GCN-NEXT: v_mov_b32_e32 v13, v25
-; GCN-NEXT: v_mov_b32_e32 v14, v26
-; GCN-NEXT: v_mov_b32_e32 v15, v27
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_smfmac_f32_32x32x32_f16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_mov_b32_e32 v0, v12
+; SDAG-NEXT: v_mov_b32_e32 v1, v13
+; SDAG-NEXT: v_mov_b32_e32 v2, v14
+; SDAG-NEXT: v_mov_b32_e32 v3, v15
+; SDAG-NEXT: v_mov_b32_e32 v4, v16
+; SDAG-NEXT: v_mov_b32_e32 v5, v17
+; SDAG-NEXT: v_mov_b32_e32 v6, v18
+; SDAG-NEXT: v_mov_b32_e32 v7, v19
+; SDAG-NEXT: v_mov_b32_e32 v8, v20
+; SDAG-NEXT: v_mov_b32_e32 v9, v21
+; SDAG-NEXT: v_mov_b32_e32 v10, v22
+; SDAG-NEXT: v_mov_b32_e32 v11, v23
+; SDAG-NEXT: v_mov_b32_e32 v12, v24
+; SDAG-NEXT: v_mov_b32_e32 v13, v25
+; SDAG-NEXT: v_mov_b32_e32 v14, v26
+; SDAG-NEXT: v_mov_b32_e32 v15, v27
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_smfmac_f32_32x32x32_f16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v48, v0
+; GISEL-NEXT: v_mov_b32_e32 v49, v1
+; GISEL-NEXT: v_mov_b32_e32 v50, v2
+; GISEL-NEXT: v_mov_b32_e32 v51, v3
+; GISEL-NEXT: v_mov_b32_e32 v30, v4
+; GISEL-NEXT: v_mov_b32_e32 v31, v5
+; GISEL-NEXT: v_mov_b32_e32 v32, v6
+; GISEL-NEXT: v_mov_b32_e32 v33, v7
+; GISEL-NEXT: v_mov_b32_e32 v34, v8
+; GISEL-NEXT: v_mov_b32_e32 v35, v9
+; GISEL-NEXT: v_mov_b32_e32 v36, v10
+; GISEL-NEXT: v_mov_b32_e32 v37, v11
+; GISEL-NEXT: v_mov_b32_e32 v0, v12
+; GISEL-NEXT: v_mov_b32_e32 v1, v13
+; GISEL-NEXT: v_mov_b32_e32 v2, v14
+; GISEL-NEXT: v_mov_b32_e32 v3, v15
+; GISEL-NEXT: v_mov_b32_e32 v4, v16
+; GISEL-NEXT: v_mov_b32_e32 v5, v17
+; GISEL-NEXT: v_mov_b32_e32 v6, v18
+; GISEL-NEXT: v_mov_b32_e32 v7, v19
+; GISEL-NEXT: v_mov_b32_e32 v8, v20
+; GISEL-NEXT: v_mov_b32_e32 v9, v21
+; GISEL-NEXT: v_mov_b32_e32 v10, v22
+; GISEL-NEXT: v_mov_b32_e32 v11, v23
+; GISEL-NEXT: v_mov_b32_e32 v12, v24
+; GISEL-NEXT: v_mov_b32_e32 v13, v25
+; GISEL-NEXT: v_mov_b32_e32 v14, v26
+; GISEL-NEXT: v_mov_b32_e32 v15, v27
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[48:51], v[30:37], v28
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.f16(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <16 x float> %result
}
define <16 x float> @test_smfmac_f32_32x32x32_f16__flags0(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3) {
-; GCN-LABEL: test_smfmac_f32_32x32x32_f16__flags0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_mov_b32_e32 v0, v12
-; GCN-NEXT: v_mov_b32_e32 v1, v13
-; GCN-NEXT: v_mov_b32_e32 v2, v14
-; GCN-NEXT: v_mov_b32_e32 v3, v15
-; GCN-NEXT: v_mov_b32_e32 v4, v16
-; GCN-NEXT: v_mov_b32_e32 v5, v17
-; GCN-NEXT: v_mov_b32_e32 v6, v18
-; GCN-NEXT: v_mov_b32_e32 v7, v19
-; GCN-NEXT: v_mov_b32_e32 v8, v20
-; GCN-NEXT: v_mov_b32_e32 v9, v21
-; GCN-NEXT: v_mov_b32_e32 v10, v22
-; GCN-NEXT: v_mov_b32_e32 v11, v23
-; GCN-NEXT: v_mov_b32_e32 v12, v24
-; GCN-NEXT: v_mov_b32_e32 v13, v25
-; GCN-NEXT: v_mov_b32_e32 v14, v26
-; GCN-NEXT: v_mov_b32_e32 v15, v27
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_smfmac_f32_32x32x32_f16__flags0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_mov_b32_e32 v0, v12
+; SDAG-NEXT: v_mov_b32_e32 v1, v13
+; SDAG-NEXT: v_mov_b32_e32 v2, v14
+; SDAG-NEXT: v_mov_b32_e32 v3, v15
+; SDAG-NEXT: v_mov_b32_e32 v4, v16
+; SDAG-NEXT: v_mov_b32_e32 v5, v17
+; SDAG-NEXT: v_mov_b32_e32 v6, v18
+; SDAG-NEXT: v_mov_b32_e32 v7, v19
+; SDAG-NEXT: v_mov_b32_e32 v8, v20
+; SDAG-NEXT: v_mov_b32_e32 v9, v21
+; SDAG-NEXT: v_mov_b32_e32 v10, v22
+; SDAG-NEXT: v_mov_b32_e32 v11, v23
+; SDAG-NEXT: v_mov_b32_e32 v12, v24
+; SDAG-NEXT: v_mov_b32_e32 v13, v25
+; SDAG-NEXT: v_mov_b32_e32 v14, v26
+; SDAG-NEXT: v_mov_b32_e32 v15, v27
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_smfmac_f32_32x32x32_f16__flags0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v48, v0
+; GISEL-NEXT: v_mov_b32_e32 v49, v1
+; GISEL-NEXT: v_mov_b32_e32 v50, v2
+; GISEL-NEXT: v_mov_b32_e32 v51, v3
+; GISEL-NEXT: v_mov_b32_e32 v30, v4
+; GISEL-NEXT: v_mov_b32_e32 v31, v5
+; GISEL-NEXT: v_mov_b32_e32 v32, v6
+; GISEL-NEXT: v_mov_b32_e32 v33, v7
+; GISEL-NEXT: v_mov_b32_e32 v34, v8
+; GISEL-NEXT: v_mov_b32_e32 v35, v9
+; GISEL-NEXT: v_mov_b32_e32 v36, v10
+; GISEL-NEXT: v_mov_b32_e32 v37, v11
+; GISEL-NEXT: v_mov_b32_e32 v0, v12
+; GISEL-NEXT: v_mov_b32_e32 v1, v13
+; GISEL-NEXT: v_mov_b32_e32 v2, v14
+; GISEL-NEXT: v_mov_b32_e32 v3, v15
+; GISEL-NEXT: v_mov_b32_e32 v4, v16
+; GISEL-NEXT: v_mov_b32_e32 v5, v17
+; GISEL-NEXT: v_mov_b32_e32 v6, v18
+; GISEL-NEXT: v_mov_b32_e32 v7, v19
+; GISEL-NEXT: v_mov_b32_e32 v8, v20
+; GISEL-NEXT: v_mov_b32_e32 v9, v21
+; GISEL-NEXT: v_mov_b32_e32 v10, v22
+; GISEL-NEXT: v_mov_b32_e32 v11, v23
+; GISEL-NEXT: v_mov_b32_e32 v12, v24
+; GISEL-NEXT: v_mov_b32_e32 v13, v25
+; GISEL-NEXT: v_mov_b32_e32 v14, v26
+; GISEL-NEXT: v_mov_b32_e32 v15, v27
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.f16(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
ret <16 x float> %result
}
define <16 x float> @test_smfmac_f32_32x32x32_f16__flags1(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3) {
-; GCN-LABEL: test_smfmac_f32_32x32x32_f16__flags1:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_mov_b32_e32 v0, v12
-; GCN-NEXT: v_mov_b32_e32 v1, v13
-; GCN-NEXT: v_mov_b32_e32 v2, v14
-; GCN-NEXT: v_mov_b32_e32 v3, v15
-; GCN-NEXT: v_mov_b32_e32 v4, v16
-; GCN-NEXT: v_mov_b32_e32 v5, v17
-; GCN-NEXT: v_mov_b32_e32 v6, v18
-; GCN-NEXT: v_mov_b32_e32 v7, v19
-; GCN-NEXT: v_mov_b32_e32 v8, v20
-; GCN-NEXT: v_mov_b32_e32 v9, v21
-; GCN-NEXT: v_mov_b32_e32 v10, v22
-; GCN-NEXT: v_mov_b32_e32 v11, v23
-; GCN-NEXT: v_mov_b32_e32 v12, v24
-; GCN-NEXT: v_mov_b32_e32 v13, v25
-; GCN-NEXT: v_mov_b32_e32 v14, v26
-; GCN-NEXT: v_mov_b32_e32 v15, v27
-; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.f16(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
- ret <16 x float> %result
-}
-
-define <16 x float> @test_smfmac_f32_32x32x32_f16__sgpr(<8 x half> inreg %arg0, <16 x half> inreg %arg1, <16 x float> inreg %arg2, i32 inreg %arg3) {
-; SDAG-LABEL: test_smfmac_f32_32x32x32_f16__sgpr:
+; SDAG-LABEL: test_smfmac_f32_32x32x32_f16__flags1:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v26, s0
-; SDAG-NEXT: v_mov_b32_e32 v27, s1
-; SDAG-NEXT: v_mov_b32_e32 v28, s2
-; SDAG-NEXT: v_mov_b32_e32 v29, s3
-; SDAG-NEXT: v_mov_b32_e32 v16, v10
-; SDAG-NEXT: v_mov_b32_e32 v15, v9
-; SDAG-NEXT: v_mov_b32_e32 v14, v8
-; SDAG-NEXT: v_mov_b32_e32 v13, v7
-; SDAG-NEXT: v_mov_b32_e32 v12, v6
-; SDAG-NEXT: v_mov_b32_e32 v11, v5
-; SDAG-NEXT: v_mov_b32_e32 v10, v4
-; SDAG-NEXT: v_mov_b32_e32 v9, v3
-; SDAG-NEXT: v_mov_b32_e32 v8, v2
-; SDAG-NEXT: v_mov_b32_e32 v7, v1
-; SDAG-NEXT: v_mov_b32_e32 v6, v0
-; SDAG-NEXT: v_mov_b32_e32 v0, s24
-; SDAG-NEXT: v_mov_b32_e32 v1, s25
-; SDAG-NEXT: v_mov_b32_e32 v2, s26
-; SDAG-NEXT: v_mov_b32_e32 v3, s27
-; SDAG-NEXT: v_mov_b32_e32 v4, s28
-; SDAG-NEXT: v_mov_b32_e32 v5, s29
-; SDAG-NEXT: v_mov_b32_e32 v18, s16
-; SDAG-NEXT: v_mov_b32_e32 v19, s17
-; SDAG-NEXT: v_mov_b32_e32 v20, s18
-; SDAG-NEXT: v_mov_b32_e32 v21, s19
-; SDAG-NEXT: v_mov_b32_e32 v22, s20
-; SDAG-NEXT: v_mov_b32_e32 v23, s21
-; SDAG-NEXT: v_mov_b32_e32 v24, s22
-; SDAG-NEXT: v_mov_b32_e32 v25, s23
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[26:29], v[18:25], v16
+; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_mov_b32_e32 v0, v12
+; SDAG-NEXT: v_mov_b32_e32 v1, v13
+; SDAG-NEXT: v_mov_b32_e32 v2, v14
+; SDAG-NEXT: v_mov_b32_e32 v3, v15
+; SDAG-NEXT: v_mov_b32_e32 v4, v16
+; SDAG-NEXT: v_mov_b32_e32 v5, v17
+; SDAG-NEXT: v_mov_b32_e32 v6, v18
+; SDAG-NEXT: v_mov_b32_e32 v7, v19
+; SDAG-NEXT: v_mov_b32_e32 v8, v20
+; SDAG-NEXT: v_mov_b32_e32 v9, v21
+; SDAG-NEXT: v_mov_b32_e32 v10, v22
+; SDAG-NEXT: v_mov_b32_e32 v11, v23
+; SDAG-NEXT: v_mov_b32_e32 v12, v24
+; SDAG-NEXT: v_mov_b32_e32 v13, v25
+; SDAG-NEXT: v_mov_b32_e32 v14, v26
+; SDAG-NEXT: v_mov_b32_e32 v15, v27
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-LABEL: test_smfmac_f32_32x32x32_f16__sgpr:
+; GISEL-LABEL: test_smfmac_f32_32x32x32_f16__flags1:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[2:3]
-; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[0:1]
-; GISEL-NEXT: v_mov_b32_e32 v18, v0
-; GISEL-NEXT: v_mov_b32_e32 v19, v1
-; GISEL-NEXT: v_mov_b32_e32 v20, v2
-; GISEL-NEXT: v_mov_b32_e32 v21, v3
-; GISEL-NEXT: v_mov_b32_e32 v22, v4
-; GISEL-NEXT: v_mov_b32_e32 v23, v5
-; GISEL-NEXT: v_mov_b32_e32 v24, v6
-; GISEL-NEXT: v_mov_b32_e32 v25, v7
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[16:17]
-; GISEL-NEXT: v_mov_b32_e32 v26, v8
-; GISEL-NEXT: v_mov_b32_e32 v27, v9
-; GISEL-NEXT: v_mov_b32_e32 v12, s24
-; GISEL-NEXT: v_mov_b32_e32 v13, s25
-; GISEL-NEXT: v_mov_b32_e32 v14, s26
-; GISEL-NEXT: v_mov_b32_e32 v15, s27
-; GISEL-NEXT: v_mov_b32_e32 v16, s28
-; GISEL-NEXT: v_mov_b32_e32 v17, s29
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[22:23]
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[28:31], v[0:7], v10
-; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_mov_b32_e32 v48, v0
+; GISEL-NEXT: v_mov_b32_e32 v49, v1
+; GISEL-NEXT: v_mov_b32_e32 v50, v2
+; GISEL-NEXT: v_mov_b32_e32 v51, v3
+; GISEL-NEXT: v_mov_b32_e32 v30, v4
+; GISEL-NEXT: v_mov_b32_e32 v31, v5
+; GISEL-NEXT: v_mov_b32_e32 v32, v6
+; GISEL-NEXT: v_mov_b32_e32 v33, v7
+; GISEL-NEXT: v_mov_b32_e32 v34, v8
+; GISEL-NEXT: v_mov_b32_e32 v35, v9
+; GISEL-NEXT: v_mov_b32_e32 v36, v10
+; GISEL-NEXT: v_mov_b32_e32 v37, v11
; GISEL-NEXT: v_mov_b32_e32 v0, v12
; GISEL-NEXT: v_mov_b32_e32 v1, v13
; GISEL-NEXT: v_mov_b32_e32 v2, v14
@@ -408,6 +425,104 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__sgpr(<8 x half> inreg %arg0,
; GISEL-NEXT: v_mov_b32_e32 v13, v25
; GISEL-NEXT: v_mov_b32_e32 v14, v26
; GISEL-NEXT: v_mov_b32_e32 v15, v27
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.f16(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
+ ret <16 x float> %result
+}
+
+define <16 x float> @test_smfmac_f32_32x32x32_f16__sgpr(<8 x half> inreg %arg0, <16 x half> inreg %arg1, <16 x float> inreg %arg2, i32 inreg %arg3) {
+; SDAG-LABEL: test_smfmac_f32_32x32x32_f16__sgpr:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_mov_b32_e32 v36, s0
+; SDAG-NEXT: v_mov_b32_e32 v37, s1
+; SDAG-NEXT: v_mov_b32_e32 v38, s2
+; SDAG-NEXT: v_mov_b32_e32 v39, s3
+; SDAG-NEXT: v_mov_b32_e32 v13, s25
+; SDAG-NEXT: v_mov_b32_e32 v14, s26
+; SDAG-NEXT: v_mov_b32_e32 v15, s27
+; SDAG-NEXT: v_mov_b32_e32 v16, s28
+; SDAG-NEXT: v_mov_b32_e32 v17, s29
+; SDAG-NEXT: v_mov_b32_e32 v28, s16
+; SDAG-NEXT: v_mov_b32_e32 v29, s17
+; SDAG-NEXT: v_mov_b32_e32 v30, s18
+; SDAG-NEXT: v_mov_b32_e32 v31, s19
+; SDAG-NEXT: v_mov_b32_e32 v32, s20
+; SDAG-NEXT: v_mov_b32_e32 v33, s21
+; SDAG-NEXT: v_mov_b32_e32 v34, s22
+; SDAG-NEXT: v_mov_b32_e32 v35, s23
+; SDAG-NEXT: v_mov_b32_e32 v12, s24
+; SDAG-NEXT: v_mov_b32_e32 v18, v0
+; SDAG-NEXT: v_mov_b32_e32 v19, v1
+; SDAG-NEXT: v_mov_b32_e32 v20, v2
+; SDAG-NEXT: v_mov_b32_e32 v21, v3
+; SDAG-NEXT: v_mov_b32_e32 v22, v4
+; SDAG-NEXT: v_mov_b32_e32 v23, v5
+; SDAG-NEXT: v_mov_b32_e32 v24, v6
+; SDAG-NEXT: v_mov_b32_e32 v25, v7
+; SDAG-NEXT: v_mov_b32_e32 v26, v8
+; SDAG-NEXT: v_mov_b32_e32 v27, v9
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[36:39], v[28:35], v10
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_mov_b32_e32 v0, v12
+; SDAG-NEXT: v_mov_b32_e32 v1, v13
+; SDAG-NEXT: v_mov_b32_e32 v2, v14
+; SDAG-NEXT: v_mov_b32_e32 v3, v15
+; SDAG-NEXT: v_mov_b32_e32 v4, v16
+; SDAG-NEXT: v_mov_b32_e32 v5, v17
+; SDAG-NEXT: v_mov_b32_e32 v6, v18
+; SDAG-NEXT: v_mov_b32_e32 v7, v19
+; SDAG-NEXT: v_mov_b32_e32 v8, v20
+; SDAG-NEXT: v_mov_b32_e32 v9, v21
+; SDAG-NEXT: v_mov_b32_e32 v10, v22
+; SDAG-NEXT: v_mov_b32_e32 v11, v23
+; SDAG-NEXT: v_mov_b32_e32 v12, v24
+; SDAG-NEXT: v_mov_b32_e32 v13, v25
+; SDAG-NEXT: v_mov_b32_e32 v14, v26
+; SDAG-NEXT: v_mov_b32_e32 v15, v27
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_smfmac_f32_32x32x32_f16__sgpr:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[2:3]
+; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[0:1]
+; GISEL-NEXT: v_mov_b32_e32 v11, v0
+; GISEL-NEXT: v_mov_b32_e32 v12, v1
+; GISEL-NEXT: v_mov_b32_e32 v13, v2
+; GISEL-NEXT: v_mov_b32_e32 v14, v3
+; GISEL-NEXT: v_mov_b32_e32 v15, v4
+; GISEL-NEXT: v_mov_b32_e32 v16, v5
+; GISEL-NEXT: v_mov_b32_e32 v17, v6
+; GISEL-NEXT: v_mov_b32_e32 v18, v7
+; GISEL-NEXT: v_mov_b32_e32 v19, v8
+; GISEL-NEXT: v_mov_b32_e32 v20, v9
+; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[22:23]
+; GISEL-NEXT: v_mov_b32_e32 v21, v10
+; GISEL-NEXT: v_mov_b32_e32 v0, s24
+; GISEL-NEXT: v_mov_b32_e32 v1, s25
+; GISEL-NEXT: v_mov_b32_e32 v2, s26
+; GISEL-NEXT: v_mov_b32_e32 v3, s27
+; GISEL-NEXT: v_mov_b32_e32 v4, s28
+; GISEL-NEXT: v_mov_b32_e32 v5, s29
+; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17]
+; GISEL-NEXT: v_mov_b32_e32 v6, v11
+; GISEL-NEXT: v_mov_b32_e32 v7, v12
+; GISEL-NEXT: v_mov_b32_e32 v8, v13
+; GISEL-NEXT: v_mov_b32_e32 v9, v14
+; GISEL-NEXT: v_mov_b32_e32 v10, v15
+; GISEL-NEXT: v_mov_b32_e32 v11, v16
+; GISEL-NEXT: v_mov_b32_e32 v12, v17
+; GISEL-NEXT: v_mov_b32_e32 v13, v18
+; GISEL-NEXT: v_mov_b32_e32 v14, v19
+; GISEL-NEXT: v_mov_b32_e32 v15, v20
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[30:33], v[22:29], v21
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.f16(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <16 x float> %result
@@ -664,37 +779,53 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__sgpr(<8 x bfloat> inreg %arg
; GCN-LABEL: test_smfmac_f32_32x32x32_bf16__sgpr:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v26, s0
-; GCN-NEXT: v_mov_b32_e32 v27, s1
-; GCN-NEXT: v_mov_b32_e32 v28, s2
-; GCN-NEXT: v_mov_b32_e32 v29, s3
-; GCN-NEXT: v_mov_b32_e32 v16, v10
-; GCN-NEXT: v_mov_b32_e32 v15, v9
-; GCN-NEXT: v_mov_b32_e32 v14, v8
-; GCN-NEXT: v_mov_b32_e32 v13, v7
-; GCN-NEXT: v_mov_b32_e32 v12, v6
-; GCN-NEXT: v_mov_b32_e32 v11, v5
-; GCN-NEXT: v_mov_b32_e32 v10, v4
-; GCN-NEXT: v_mov_b32_e32 v9, v3
-; GCN-NEXT: v_mov_b32_e32 v8, v2
-; GCN-NEXT: v_mov_b32_e32 v7, v1
-; GCN-NEXT: v_mov_b32_e32 v6, v0
-; GCN-NEXT: v_mov_b32_e32 v0, s24
-; GCN-NEXT: v_mov_b32_e32 v1, s25
-; GCN-NEXT: v_mov_b32_e32 v2, s26
-; GCN-NEXT: v_mov_b32_e32 v3, s27
-; GCN-NEXT: v_mov_b32_e32 v4, s28
-; GCN-NEXT: v_mov_b32_e32 v5, s29
-; GCN-NEXT: v_mov_b32_e32 v18, s16
-; GCN-NEXT: v_mov_b32_e32 v19, s17
-; GCN-NEXT: v_mov_b32_e32 v20, s18
-; GCN-NEXT: v_mov_b32_e32 v21, s19
-; GCN-NEXT: v_mov_b32_e32 v22, s20
-; GCN-NEXT: v_mov_b32_e32 v23, s21
-; GCN-NEXT: v_mov_b32_e32 v24, s22
-; GCN-NEXT: v_mov_b32_e32 v25, s23
+; GCN-NEXT: v_mov_b32_e32 v36, s0
+; GCN-NEXT: v_mov_b32_e32 v37, s1
+; GCN-NEXT: v_mov_b32_e32 v38, s2
+; GCN-NEXT: v_mov_b32_e32 v39, s3
+; GCN-NEXT: v_mov_b32_e32 v13, s25
+; GCN-NEXT: v_mov_b32_e32 v14, s26
+; GCN-NEXT: v_mov_b32_e32 v15, s27
+; GCN-NEXT: v_mov_b32_e32 v16, s28
+; GCN-NEXT: v_mov_b32_e32 v17, s29
+; GCN-NEXT: v_mov_b32_e32 v28, s16
+; GCN-NEXT: v_mov_b32_e32 v29, s17
+; GCN-NEXT: v_mov_b32_e32 v30, s18
+; GCN-NEXT: v_mov_b32_e32 v31, s19
+; GCN-NEXT: v_mov_b32_e32 v32, s20
+; GCN-NEXT: v_mov_b32_e32 v33, s21
+; GCN-NEXT: v_mov_b32_e32 v34, s22
+; GCN-NEXT: v_mov_b32_e32 v35, s23
+; GCN-NEXT: v_mov_b32_e32 v12, s24
+; GCN-NEXT: v_mov_b32_e32 v18, v0
+; GCN-NEXT: v_mov_b32_e32 v19, v1
+; GCN-NEXT: v_mov_b32_e32 v20, v2
+; GCN-NEXT: v_mov_b32_e32 v21, v3
+; GCN-NEXT: v_mov_b32_e32 v22, v4
+; GCN-NEXT: v_mov_b32_e32 v23, v5
+; GCN-NEXT: v_mov_b32_e32 v24, v6
+; GCN-NEXT: v_mov_b32_e32 v25, v7
+; GCN-NEXT: v_mov_b32_e32 v26, v8
+; GCN-NEXT: v_mov_b32_e32 v27, v9
; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 v[0:15], v[26:29], v[18:25], v16
+; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 v[12:27], v[36:39], v[28:35], v10
+; GCN-NEXT: s_nop 11
+; GCN-NEXT: v_mov_b32_e32 v0, v12
+; GCN-NEXT: v_mov_b32_e32 v1, v13
+; GCN-NEXT: v_mov_b32_e32 v2, v14
+; GCN-NEXT: v_mov_b32_e32 v3, v15
+; GCN-NEXT: v_mov_b32_e32 v4, v16
+; GCN-NEXT: v_mov_b32_e32 v5, v17
+; GCN-NEXT: v_mov_b32_e32 v6, v18
+; GCN-NEXT: v_mov_b32_e32 v7, v19
+; GCN-NEXT: v_mov_b32_e32 v8, v20
+; GCN-NEXT: v_mov_b32_e32 v9, v21
+; GCN-NEXT: v_mov_b32_e32 v10, v22
+; GCN-NEXT: v_mov_b32_e32 v11, v23
+; GCN-NEXT: v_mov_b32_e32 v12, v24
+; GCN-NEXT: v_mov_b32_e32 v13, v25
+; GCN-NEXT: v_mov_b32_e32 v14, v26
+; GCN-NEXT: v_mov_b32_e32 v15, v27
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <16 x float> %result
@@ -851,19 +982,19 @@ define <4 x i32> @test_smfmac_i32_16x16x128_i8__sgpr(<4 x i32> inreg %arg0, <8 x
; GISEL-LABEL: test_smfmac_i32_16x16x128_i8__sgpr:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[2:3]
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[0:1]
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[22:23]
; GISEL-NEXT: v_mov_b32_e32 v0, s24
; GISEL-NEXT: v_mov_b32_e32 v1, s25
; GISEL-NEXT: v_mov_b32_e32 v2, s26
; GISEL-NEXT: v_mov_b32_e32 v3, s27
-; GISEL-NEXT: v_mov_b32_e32 v4, s28
+; GISEL-NEXT: v_mov_b32_e32 v16, s28
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_smfmac_i32_16x16x128_i8 v[0:3], v[14:17], v[6:13], v4
+; GISEL-NEXT: v_smfmac_i32_16x16x128_i8 v[0:3], v[12:15], v[4:11], v16
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x128.i8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <4 x i32> %result
@@ -955,151 +1086,44 @@ bb:
}
define <16 x i32> @test_smfmac_i32_32x32x64_i8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3) {
-; GCN-LABEL: test_smfmac_i32_32x32x64_i8:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_mov_b32_e32 v0, v12
-; GCN-NEXT: v_mov_b32_e32 v1, v13
-; GCN-NEXT: v_mov_b32_e32 v2, v14
-; GCN-NEXT: v_mov_b32_e32 v3, v15
-; GCN-NEXT: v_mov_b32_e32 v4, v16
-; GCN-NEXT: v_mov_b32_e32 v5, v17
-; GCN-NEXT: v_mov_b32_e32 v6, v18
-; GCN-NEXT: v_mov_b32_e32 v7, v19
-; GCN-NEXT: v_mov_b32_e32 v8, v20
-; GCN-NEXT: v_mov_b32_e32 v9, v21
-; GCN-NEXT: v_mov_b32_e32 v10, v22
-; GCN-NEXT: v_mov_b32_e32 v11, v23
-; GCN-NEXT: v_mov_b32_e32 v12, v24
-; GCN-NEXT: v_mov_b32_e32 v13, v25
-; GCN-NEXT: v_mov_b32_e32 v14, v26
-; GCN-NEXT: v_mov_b32_e32 v15, v27
-; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
- ret <16 x i32> %result
-}
-
-define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3) {
-; GCN-LABEL: test_smfmac_i32_32x32x64_i8__flags0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_mov_b32_e32 v0, v12
-; GCN-NEXT: v_mov_b32_e32 v1, v13
-; GCN-NEXT: v_mov_b32_e32 v2, v14
-; GCN-NEXT: v_mov_b32_e32 v3, v15
-; GCN-NEXT: v_mov_b32_e32 v4, v16
-; GCN-NEXT: v_mov_b32_e32 v5, v17
-; GCN-NEXT: v_mov_b32_e32 v6, v18
-; GCN-NEXT: v_mov_b32_e32 v7, v19
-; GCN-NEXT: v_mov_b32_e32 v8, v20
-; GCN-NEXT: v_mov_b32_e32 v9, v21
-; GCN-NEXT: v_mov_b32_e32 v10, v22
-; GCN-NEXT: v_mov_b32_e32 v11, v23
-; GCN-NEXT: v_mov_b32_e32 v12, v24
-; GCN-NEXT: v_mov_b32_e32 v13, v25
-; GCN-NEXT: v_mov_b32_e32 v14, v26
-; GCN-NEXT: v_mov_b32_e32 v15, v27
-; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
- ret <16 x i32> %result
-}
-
-define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3) {
-; GCN-LABEL: test_smfmac_i32_32x32x64_i8__flags1:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_mov_b32_e32 v0, v12
-; GCN-NEXT: v_mov_b32_e32 v1, v13
-; GCN-NEXT: v_mov_b32_e32 v2, v14
-; GCN-NEXT: v_mov_b32_e32 v3, v15
-; GCN-NEXT: v_mov_b32_e32 v4, v16
-; GCN-NEXT: v_mov_b32_e32 v5, v17
-; GCN-NEXT: v_mov_b32_e32 v6, v18
-; GCN-NEXT: v_mov_b32_e32 v7, v19
-; GCN-NEXT: v_mov_b32_e32 v8, v20
-; GCN-NEXT: v_mov_b32_e32 v9, v21
-; GCN-NEXT: v_mov_b32_e32 v10, v22
-; GCN-NEXT: v_mov_b32_e32 v11, v23
-; GCN-NEXT: v_mov_b32_e32 v12, v24
-; GCN-NEXT: v_mov_b32_e32 v13, v25
-; GCN-NEXT: v_mov_b32_e32 v14, v26
-; GCN-NEXT: v_mov_b32_e32 v15, v27
-; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
- ret <16 x i32> %result
-}
-
-define <16 x i32> @test_smfmac_i32_32x32x64_i8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x i32> inreg %arg2, i32 inreg %arg3) {
-; SDAG-LABEL: test_smfmac_i32_32x32x64_i8__sgpr:
+; SDAG-LABEL: test_smfmac_i32_32x32x64_i8:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v26, s0
-; SDAG-NEXT: v_mov_b32_e32 v27, s1
-; SDAG-NEXT: v_mov_b32_e32 v28, s2
-; SDAG-NEXT: v_mov_b32_e32 v29, s3
-; SDAG-NEXT: v_mov_b32_e32 v16, v10
-; SDAG-NEXT: v_mov_b32_e32 v15, v9
-; SDAG-NEXT: v_mov_b32_e32 v14, v8
-; SDAG-NEXT: v_mov_b32_e32 v13, v7
-; SDAG-NEXT: v_mov_b32_e32 v12, v6
-; SDAG-NEXT: v_mov_b32_e32 v11, v5
-; SDAG-NEXT: v_mov_b32_e32 v10, v4
-; SDAG-NEXT: v_mov_b32_e32 v9, v3
-; SDAG-NEXT: v_mov_b32_e32 v8, v2
-; SDAG-NEXT: v_mov_b32_e32 v7, v1
-; SDAG-NEXT: v_mov_b32_e32 v6, v0
-; SDAG-NEXT: v_mov_b32_e32 v0, s24
-; SDAG-NEXT: v_mov_b32_e32 v1, s25
-; SDAG-NEXT: v_mov_b32_e32 v2, s26
-; SDAG-NEXT: v_mov_b32_e32 v3, s27
-; SDAG-NEXT: v_mov_b32_e32 v4, s28
-; SDAG-NEXT: v_mov_b32_e32 v5, s29
-; SDAG-NEXT: v_mov_b32_e32 v18, s16
-; SDAG-NEXT: v_mov_b32_e32 v19, s17
-; SDAG-NEXT: v_mov_b32_e32 v20, s18
-; SDAG-NEXT: v_mov_b32_e32 v21, s19
-; SDAG-NEXT: v_mov_b32_e32 v22, s20
-; SDAG-NEXT: v_mov_b32_e32 v23, s21
-; SDAG-NEXT: v_mov_b32_e32 v24, s22
-; SDAG-NEXT: v_mov_b32_e32 v25, s23
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[26:29], v[18:25], v16
+; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_mov_b32_e32 v0, v12
+; SDAG-NEXT: v_mov_b32_e32 v1, v13
+; SDAG-NEXT: v_mov_b32_e32 v2, v14
+; SDAG-NEXT: v_mov_b32_e32 v3, v15
+; SDAG-NEXT: v_mov_b32_e32 v4, v16
+; SDAG-NEXT: v_mov_b32_e32 v5, v17
+; SDAG-NEXT: v_mov_b32_e32 v6, v18
+; SDAG-NEXT: v_mov_b32_e32 v7, v19
+; SDAG-NEXT: v_mov_b32_e32 v8, v20
+; SDAG-NEXT: v_mov_b32_e32 v9, v21
+; SDAG-NEXT: v_mov_b32_e32 v10, v22
+; SDAG-NEXT: v_mov_b32_e32 v11, v23
+; SDAG-NEXT: v_mov_b32_e32 v12, v24
+; SDAG-NEXT: v_mov_b32_e32 v13, v25
+; SDAG-NEXT: v_mov_b32_e32 v14, v26
+; SDAG-NEXT: v_mov_b32_e32 v15, v27
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-LABEL: test_smfmac_i32_32x32x64_i8__sgpr:
+; GISEL-LABEL: test_smfmac_i32_32x32x64_i8:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[2:3]
-; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[0:1]
-; GISEL-NEXT: v_mov_b32_e32 v18, v0
-; GISEL-NEXT: v_mov_b32_e32 v19, v1
-; GISEL-NEXT: v_mov_b32_e32 v20, v2
-; GISEL-NEXT: v_mov_b32_e32 v21, v3
-; GISEL-NEXT: v_mov_b32_e32 v22, v4
-; GISEL-NEXT: v_mov_b32_e32 v23, v5
-; GISEL-NEXT: v_mov_b32_e32 v24, v6
-; GISEL-NEXT: v_mov_b32_e32 v25, v7
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[16:17]
-; GISEL-NEXT: v_mov_b32_e32 v26, v8
-; GISEL-NEXT: v_mov_b32_e32 v27, v9
-; GISEL-NEXT: v_mov_b32_e32 v12, s24
-; GISEL-NEXT: v_mov_b32_e32 v13, s25
-; GISEL-NEXT: v_mov_b32_e32 v14, s26
-; GISEL-NEXT: v_mov_b32_e32 v15, s27
-; GISEL-NEXT: v_mov_b32_e32 v16, s28
-; GISEL-NEXT: v_mov_b32_e32 v17, s29
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[22:23]
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[28:31], v[0:7], v10
-; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_mov_b32_e32 v48, v0
+; GISEL-NEXT: v_mov_b32_e32 v49, v1
+; GISEL-NEXT: v_mov_b32_e32 v50, v2
+; GISEL-NEXT: v_mov_b32_e32 v51, v3
+; GISEL-NEXT: v_mov_b32_e32 v30, v4
+; GISEL-NEXT: v_mov_b32_e32 v31, v5
+; GISEL-NEXT: v_mov_b32_e32 v32, v6
+; GISEL-NEXT: v_mov_b32_e32 v33, v7
+; GISEL-NEXT: v_mov_b32_e32 v34, v8
+; GISEL-NEXT: v_mov_b32_e32 v35, v9
+; GISEL-NEXT: v_mov_b32_e32 v36, v10
+; GISEL-NEXT: v_mov_b32_e32 v37, v11
; GISEL-NEXT: v_mov_b32_e32 v0, v12
; GISEL-NEXT: v_mov_b32_e32 v1, v13
; GISEL-NEXT: v_mov_b32_e32 v2, v14
@@ -1116,19 +1140,241 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__sgpr(<4 x i32> inreg %arg0, <8 x
; GISEL-NEXT: v_mov_b32_e32 v13, v25
; GISEL-NEXT: v_mov_b32_e32 v14, v26
; GISEL-NEXT: v_mov_b32_e32 v15, v27
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[48:51], v[30:37], v28
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <16 x i32> %result
}
-; --------------------------------------------------------------------
-; llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8
-; --------------------------------------------------------------------
-
-declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8(<4 x i32>, <8 x i32>, <4 x float>, i32, i32 immarg, i32 immarg)
-
-define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace(1) %arg, <4 x i32> %a, <8 x i32> %b, i32 %idx) #0 {
-; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__vgpr:
+define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3) {
+; SDAG-LABEL: test_smfmac_i32_32x32x64_i8__flags0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_mov_b32_e32 v0, v12
+; SDAG-NEXT: v_mov_b32_e32 v1, v13
+; SDAG-NEXT: v_mov_b32_e32 v2, v14
+; SDAG-NEXT: v_mov_b32_e32 v3, v15
+; SDAG-NEXT: v_mov_b32_e32 v4, v16
+; SDAG-NEXT: v_mov_b32_e32 v5, v17
+; SDAG-NEXT: v_mov_b32_e32 v6, v18
+; SDAG-NEXT: v_mov_b32_e32 v7, v19
+; SDAG-NEXT: v_mov_b32_e32 v8, v20
+; SDAG-NEXT: v_mov_b32_e32 v9, v21
+; SDAG-NEXT: v_mov_b32_e32 v10, v22
+; SDAG-NEXT: v_mov_b32_e32 v11, v23
+; SDAG-NEXT: v_mov_b32_e32 v12, v24
+; SDAG-NEXT: v_mov_b32_e32 v13, v25
+; SDAG-NEXT: v_mov_b32_e32 v14, v26
+; SDAG-NEXT: v_mov_b32_e32 v15, v27
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_smfmac_i32_32x32x64_i8__flags0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v48, v0
+; GISEL-NEXT: v_mov_b32_e32 v49, v1
+; GISEL-NEXT: v_mov_b32_e32 v50, v2
+; GISEL-NEXT: v_mov_b32_e32 v51, v3
+; GISEL-NEXT: v_mov_b32_e32 v30, v4
+; GISEL-NEXT: v_mov_b32_e32 v31, v5
+; GISEL-NEXT: v_mov_b32_e32 v32, v6
+; GISEL-NEXT: v_mov_b32_e32 v33, v7
+; GISEL-NEXT: v_mov_b32_e32 v34, v8
+; GISEL-NEXT: v_mov_b32_e32 v35, v9
+; GISEL-NEXT: v_mov_b32_e32 v36, v10
+; GISEL-NEXT: v_mov_b32_e32 v37, v11
+; GISEL-NEXT: v_mov_b32_e32 v0, v12
+; GISEL-NEXT: v_mov_b32_e32 v1, v13
+; GISEL-NEXT: v_mov_b32_e32 v2, v14
+; GISEL-NEXT: v_mov_b32_e32 v3, v15
+; GISEL-NEXT: v_mov_b32_e32 v4, v16
+; GISEL-NEXT: v_mov_b32_e32 v5, v17
+; GISEL-NEXT: v_mov_b32_e32 v6, v18
+; GISEL-NEXT: v_mov_b32_e32 v7, v19
+; GISEL-NEXT: v_mov_b32_e32 v8, v20
+; GISEL-NEXT: v_mov_b32_e32 v9, v21
+; GISEL-NEXT: v_mov_b32_e32 v10, v22
+; GISEL-NEXT: v_mov_b32_e32 v11, v23
+; GISEL-NEXT: v_mov_b32_e32 v12, v24
+; GISEL-NEXT: v_mov_b32_e32 v13, v25
+; GISEL-NEXT: v_mov_b32_e32 v14, v26
+; GISEL-NEXT: v_mov_b32_e32 v15, v27
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
+ ret <16 x i32> %result
+}
+
+define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3) {
+; SDAG-LABEL: test_smfmac_i32_32x32x64_i8__flags1:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_mov_b32_e32 v0, v12
+; SDAG-NEXT: v_mov_b32_e32 v1, v13
+; SDAG-NEXT: v_mov_b32_e32 v2, v14
+; SDAG-NEXT: v_mov_b32_e32 v3, v15
+; SDAG-NEXT: v_mov_b32_e32 v4, v16
+; SDAG-NEXT: v_mov_b32_e32 v5, v17
+; SDAG-NEXT: v_mov_b32_e32 v6, v18
+; SDAG-NEXT: v_mov_b32_e32 v7, v19
+; SDAG-NEXT: v_mov_b32_e32 v8, v20
+; SDAG-NEXT: v_mov_b32_e32 v9, v21
+; SDAG-NEXT: v_mov_b32_e32 v10, v22
+; SDAG-NEXT: v_mov_b32_e32 v11, v23
+; SDAG-NEXT: v_mov_b32_e32 v12, v24
+; SDAG-NEXT: v_mov_b32_e32 v13, v25
+; SDAG-NEXT: v_mov_b32_e32 v14, v26
+; SDAG-NEXT: v_mov_b32_e32 v15, v27
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_smfmac_i32_32x32x64_i8__flags1:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v48, v0
+; GISEL-NEXT: v_mov_b32_e32 v49, v1
+; GISEL-NEXT: v_mov_b32_e32 v50, v2
+; GISEL-NEXT: v_mov_b32_e32 v51, v3
+; GISEL-NEXT: v_mov_b32_e32 v30, v4
+; GISEL-NEXT: v_mov_b32_e32 v31, v5
+; GISEL-NEXT: v_mov_b32_e32 v32, v6
+; GISEL-NEXT: v_mov_b32_e32 v33, v7
+; GISEL-NEXT: v_mov_b32_e32 v34, v8
+; GISEL-NEXT: v_mov_b32_e32 v35, v9
+; GISEL-NEXT: v_mov_b32_e32 v36, v10
+; GISEL-NEXT: v_mov_b32_e32 v37, v11
+; GISEL-NEXT: v_mov_b32_e32 v0, v12
+; GISEL-NEXT: v_mov_b32_e32 v1, v13
+; GISEL-NEXT: v_mov_b32_e32 v2, v14
+; GISEL-NEXT: v_mov_b32_e32 v3, v15
+; GISEL-NEXT: v_mov_b32_e32 v4, v16
+; GISEL-NEXT: v_mov_b32_e32 v5, v17
+; GISEL-NEXT: v_mov_b32_e32 v6, v18
+; GISEL-NEXT: v_mov_b32_e32 v7, v19
+; GISEL-NEXT: v_mov_b32_e32 v8, v20
+; GISEL-NEXT: v_mov_b32_e32 v9, v21
+; GISEL-NEXT: v_mov_b32_e32 v10, v22
+; GISEL-NEXT: v_mov_b32_e32 v11, v23
+; GISEL-NEXT: v_mov_b32_e32 v12, v24
+; GISEL-NEXT: v_mov_b32_e32 v13, v25
+; GISEL-NEXT: v_mov_b32_e32 v14, v26
+; GISEL-NEXT: v_mov_b32_e32 v15, v27
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
+ ret <16 x i32> %result
+}
+
+define <16 x i32> @test_smfmac_i32_32x32x64_i8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x i32> inreg %arg2, i32 inreg %arg3) {
+; SDAG-LABEL: test_smfmac_i32_32x32x64_i8__sgpr:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_mov_b32_e32 v36, s0
+; SDAG-NEXT: v_mov_b32_e32 v37, s1
+; SDAG-NEXT: v_mov_b32_e32 v38, s2
+; SDAG-NEXT: v_mov_b32_e32 v39, s3
+; SDAG-NEXT: v_mov_b32_e32 v13, s25
+; SDAG-NEXT: v_mov_b32_e32 v14, s26
+; SDAG-NEXT: v_mov_b32_e32 v15, s27
+; SDAG-NEXT: v_mov_b32_e32 v16, s28
+; SDAG-NEXT: v_mov_b32_e32 v17, s29
+; SDAG-NEXT: v_mov_b32_e32 v28, s16
+; SDAG-NEXT: v_mov_b32_e32 v29, s17
+; SDAG-NEXT: v_mov_b32_e32 v30, s18
+; SDAG-NEXT: v_mov_b32_e32 v31, s19
+; SDAG-NEXT: v_mov_b32_e32 v32, s20
+; SDAG-NEXT: v_mov_b32_e32 v33, s21
+; SDAG-NEXT: v_mov_b32_e32 v34, s22
+; SDAG-NEXT: v_mov_b32_e32 v35, s23
+; SDAG-NEXT: v_mov_b32_e32 v12, s24
+; SDAG-NEXT: v_mov_b32_e32 v18, v0
+; SDAG-NEXT: v_mov_b32_e32 v19, v1
+; SDAG-NEXT: v_mov_b32_e32 v20, v2
+; SDAG-NEXT: v_mov_b32_e32 v21, v3
+; SDAG-NEXT: v_mov_b32_e32 v22, v4
+; SDAG-NEXT: v_mov_b32_e32 v23, v5
+; SDAG-NEXT: v_mov_b32_e32 v24, v6
+; SDAG-NEXT: v_mov_b32_e32 v25, v7
+; SDAG-NEXT: v_mov_b32_e32 v26, v8
+; SDAG-NEXT: v_mov_b32_e32 v27, v9
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[36:39], v[28:35], v10
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_mov_b32_e32 v0, v12
+; SDAG-NEXT: v_mov_b32_e32 v1, v13
+; SDAG-NEXT: v_mov_b32_e32 v2, v14
+; SDAG-NEXT: v_mov_b32_e32 v3, v15
+; SDAG-NEXT: v_mov_b32_e32 v4, v16
+; SDAG-NEXT: v_mov_b32_e32 v5, v17
+; SDAG-NEXT: v_mov_b32_e32 v6, v18
+; SDAG-NEXT: v_mov_b32_e32 v7, v19
+; SDAG-NEXT: v_mov_b32_e32 v8, v20
+; SDAG-NEXT: v_mov_b32_e32 v9, v21
+; SDAG-NEXT: v_mov_b32_e32 v10, v22
+; SDAG-NEXT: v_mov_b32_e32 v11, v23
+; SDAG-NEXT: v_mov_b32_e32 v12, v24
+; SDAG-NEXT: v_mov_b32_e32 v13, v25
+; SDAG-NEXT: v_mov_b32_e32 v14, v26
+; SDAG-NEXT: v_mov_b32_e32 v15, v27
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_smfmac_i32_32x32x64_i8__sgpr:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[2:3]
+; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[0:1]
+; GISEL-NEXT: v_mov_b32_e32 v11, v0
+; GISEL-NEXT: v_mov_b32_e32 v12, v1
+; GISEL-NEXT: v_mov_b32_e32 v13, v2
+; GISEL-NEXT: v_mov_b32_e32 v14, v3
+; GISEL-NEXT: v_mov_b32_e32 v15, v4
+; GISEL-NEXT: v_mov_b32_e32 v16, v5
+; GISEL-NEXT: v_mov_b32_e32 v17, v6
+; GISEL-NEXT: v_mov_b32_e32 v18, v7
+; GISEL-NEXT: v_mov_b32_e32 v19, v8
+; GISEL-NEXT: v_mov_b32_e32 v20, v9
+; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[22:23]
+; GISEL-NEXT: v_mov_b32_e32 v21, v10
+; GISEL-NEXT: v_mov_b32_e32 v0, s24
+; GISEL-NEXT: v_mov_b32_e32 v1, s25
+; GISEL-NEXT: v_mov_b32_e32 v2, s26
+; GISEL-NEXT: v_mov_b32_e32 v3, s27
+; GISEL-NEXT: v_mov_b32_e32 v4, s28
+; GISEL-NEXT: v_mov_b32_e32 v5, s29
+; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17]
+; GISEL-NEXT: v_mov_b32_e32 v6, v11
+; GISEL-NEXT: v_mov_b32_e32 v7, v12
+; GISEL-NEXT: v_mov_b32_e32 v8, v13
+; GISEL-NEXT: v_mov_b32_e32 v9, v14
+; GISEL-NEXT: v_mov_b32_e32 v10, v15
+; GISEL-NEXT: v_mov_b32_e32 v11, v16
+; GISEL-NEXT: v_mov_b32_e32 v12, v17
+; GISEL-NEXT: v_mov_b32_e32 v13, v18
+; GISEL-NEXT: v_mov_b32_e32 v14, v19
+; GISEL-NEXT: v_mov_b32_e32 v15, v20
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[30:33], v[22:29], v21
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
+ ret <16 x i32> %result
+}
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8
+; --------------------------------------------------------------------
+
+declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8(<4 x i32>, <8 x i32>, <4 x float>, i32, i32 immarg, i32 immarg)
+
+define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace(1) %arg, <4 x i32> %a, <8 x i32> %b, i32 %idx) #0 {
+; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__vgpr:
; SDAG: ; %bb.0: ; %bb
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
@@ -1272,19 +1518,19 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__sgpr(<4 x i32> inreg %arg
; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__sgpr:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[2:3]
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[0:1]
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[22:23]
; GISEL-NEXT: v_mov_b32_e32 v0, s24
; GISEL-NEXT: v_mov_b32_e32 v1, s25
; GISEL-NEXT: v_mov_b32_e32 v2, s26
; GISEL-NEXT: v_mov_b32_e32 v3, s27
-; GISEL-NEXT: v_mov_b32_e32 v4, s28
+; GISEL-NEXT: v_mov_b32_e32 v16, s28
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[0:3], v[14:17], v[6:13], v4
+; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[0:3], v[12:15], v[4:11], v16
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <4 x float> %result
@@ -1441,19 +1687,19 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__sgpr(<4 x i32> inreg %arg
; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__sgpr:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[2:3]
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[0:1]
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[22:23]
; GISEL-NEXT: v_mov_b32_e32 v0, s24
; GISEL-NEXT: v_mov_b32_e32 v1, s25
; GISEL-NEXT: v_mov_b32_e32 v2, s26
; GISEL-NEXT: v_mov_b32_e32 v3, s27
-; GISEL-NEXT: v_mov_b32_e32 v4, s28
+; GISEL-NEXT: v_mov_b32_e32 v16, s28
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[0:3], v[14:17], v[6:13], v4
+; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[0:3], v[12:15], v[4:11], v16
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <4 x float> %result
@@ -1610,19 +1856,19 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__sgpr(<4 x i32> inreg %arg
; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__sgpr:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[2:3]
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[0:1]
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[22:23]
; GISEL-NEXT: v_mov_b32_e32 v0, s24
; GISEL-NEXT: v_mov_b32_e32 v1, s25
; GISEL-NEXT: v_mov_b32_e32 v2, s26
; GISEL-NEXT: v_mov_b32_e32 v3, s27
-; GISEL-NEXT: v_mov_b32_e32 v4, s28
+; GISEL-NEXT: v_mov_b32_e32 v16, s28
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[0:3], v[14:17], v[6:13], v4
+; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[0:3], v[12:15], v[4:11], v16
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <4 x float> %result
@@ -1779,19 +2025,19 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__sgpr(<4 x i32> inreg %arg
; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__sgpr:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[2:3]
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[0:1]
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[22:23]
; GISEL-NEXT: v_mov_b32_e32 v0, s24
; GISEL-NEXT: v_mov_b32_e32 v1, s25
; GISEL-NEXT: v_mov_b32_e32 v2, s26
; GISEL-NEXT: v_mov_b32_e32 v3, s27
-; GISEL-NEXT: v_mov_b32_e32 v4, s28
+; GISEL-NEXT: v_mov_b32_e32 v16, s28
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[0:3], v[14:17], v[6:13], v4
+; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[0:3], v[12:15], v[4:11], v16
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <4 x float> %result
@@ -1883,151 +2129,168 @@ bb:
}
define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
-; GCN-LABEL: test_smfmac_f32_32x32x64_bf8_bf8:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_mov_b32_e32 v0, v12
-; GCN-NEXT: v_mov_b32_e32 v1, v13
-; GCN-NEXT: v_mov_b32_e32 v2, v14
-; GCN-NEXT: v_mov_b32_e32 v3, v15
-; GCN-NEXT: v_mov_b32_e32 v4, v16
-; GCN-NEXT: v_mov_b32_e32 v5, v17
-; GCN-NEXT: v_mov_b32_e32 v6, v18
-; GCN-NEXT: v_mov_b32_e32 v7, v19
-; GCN-NEXT: v_mov_b32_e32 v8, v20
-; GCN-NEXT: v_mov_b32_e32 v9, v21
-; GCN-NEXT: v_mov_b32_e32 v10, v22
-; GCN-NEXT: v_mov_b32_e32 v11, v23
-; GCN-NEXT: v_mov_b32_e32 v12, v24
-; GCN-NEXT: v_mov_b32_e32 v13, v25
-; GCN-NEXT: v_mov_b32_e32 v14, v26
-; GCN-NEXT: v_mov_b32_e32 v15, v27
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_mov_b32_e32 v0, v12
+; SDAG-NEXT: v_mov_b32_e32 v1, v13
+; SDAG-NEXT: v_mov_b32_e32 v2, v14
+; SDAG-NEXT: v_mov_b32_e32 v3, v15
+; SDAG-NEXT: v_mov_b32_e32 v4, v16
+; SDAG-NEXT: v_mov_b32_e32 v5, v17
+; SDAG-NEXT: v_mov_b32_e32 v6, v18
+; SDAG-NEXT: v_mov_b32_e32 v7, v19
+; SDAG-NEXT: v_mov_b32_e32 v8, v20
+; SDAG-NEXT: v_mov_b32_e32 v9, v21
+; SDAG-NEXT: v_mov_b32_e32 v10, v22
+; SDAG-NEXT: v_mov_b32_e32 v11, v23
+; SDAG-NEXT: v_mov_b32_e32 v12, v24
+; SDAG-NEXT: v_mov_b32_e32 v13, v25
+; SDAG-NEXT: v_mov_b32_e32 v14, v26
+; SDAG-NEXT: v_mov_b32_e32 v15, v27
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v48, v0
+; GISEL-NEXT: v_mov_b32_e32 v49, v1
+; GISEL-NEXT: v_mov_b32_e32 v50, v2
+; GISEL-NEXT: v_mov_b32_e32 v51, v3
+; GISEL-NEXT: v_mov_b32_e32 v30, v4
+; GISEL-NEXT: v_mov_b32_e32 v31, v5
+; GISEL-NEXT: v_mov_b32_e32 v32, v6
+; GISEL-NEXT: v_mov_b32_e32 v33, v7
+; GISEL-NEXT: v_mov_b32_e32 v34, v8
+; GISEL-NEXT: v_mov_b32_e32 v35, v9
+; GISEL-NEXT: v_mov_b32_e32 v36, v10
+; GISEL-NEXT: v_mov_b32_e32 v37, v11
+; GISEL-NEXT: v_mov_b32_e32 v0, v12
+; GISEL-NEXT: v_mov_b32_e32 v1, v13
+; GISEL-NEXT: v_mov_b32_e32 v2, v14
+; GISEL-NEXT: v_mov_b32_e32 v3, v15
+; GISEL-NEXT: v_mov_b32_e32 v4, v16
+; GISEL-NEXT: v_mov_b32_e32 v5, v17
+; GISEL-NEXT: v_mov_b32_e32 v6, v18
+; GISEL-NEXT: v_mov_b32_e32 v7, v19
+; GISEL-NEXT: v_mov_b32_e32 v8, v20
+; GISEL-NEXT: v_mov_b32_e32 v9, v21
+; GISEL-NEXT: v_mov_b32_e32 v10, v22
+; GISEL-NEXT: v_mov_b32_e32 v11, v23
+; GISEL-NEXT: v_mov_b32_e32 v12, v24
+; GISEL-NEXT: v_mov_b32_e32 v13, v25
+; GISEL-NEXT: v_mov_b32_e32 v14, v26
+; GISEL-NEXT: v_mov_b32_e32 v15, v27
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[48:51], v[30:37], v28
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <16 x float> %result
}
define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
-; GCN-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_mov_b32_e32 v0, v12
-; GCN-NEXT: v_mov_b32_e32 v1, v13
-; GCN-NEXT: v_mov_b32_e32 v2, v14
-; GCN-NEXT: v_mov_b32_e32 v3, v15
-; GCN-NEXT: v_mov_b32_e32 v4, v16
-; GCN-NEXT: v_mov_b32_e32 v5, v17
-; GCN-NEXT: v_mov_b32_e32 v6, v18
-; GCN-NEXT: v_mov_b32_e32 v7, v19
-; GCN-NEXT: v_mov_b32_e32 v8, v20
-; GCN-NEXT: v_mov_b32_e32 v9, v21
-; GCN-NEXT: v_mov_b32_e32 v10, v22
-; GCN-NEXT: v_mov_b32_e32 v11, v23
-; GCN-NEXT: v_mov_b32_e32 v12, v24
-; GCN-NEXT: v_mov_b32_e32 v13, v25
-; GCN-NEXT: v_mov_b32_e32 v14, v26
-; GCN-NEXT: v_mov_b32_e32 v15, v27
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_mov_b32_e32 v0, v12
+; SDAG-NEXT: v_mov_b32_e32 v1, v13
+; SDAG-NEXT: v_mov_b32_e32 v2, v14
+; SDAG-NEXT: v_mov_b32_e32 v3, v15
+; SDAG-NEXT: v_mov_b32_e32 v4, v16
+; SDAG-NEXT: v_mov_b32_e32 v5, v17
+; SDAG-NEXT: v_mov_b32_e32 v6, v18
+; SDAG-NEXT: v_mov_b32_e32 v7, v19
+; SDAG-NEXT: v_mov_b32_e32 v8, v20
+; SDAG-NEXT: v_mov_b32_e32 v9, v21
+; SDAG-NEXT: v_mov_b32_e32 v10, v22
+; SDAG-NEXT: v_mov_b32_e32 v11, v23
+; SDAG-NEXT: v_mov_b32_e32 v12, v24
+; SDAG-NEXT: v_mov_b32_e32 v13, v25
+; SDAG-NEXT: v_mov_b32_e32 v14, v26
+; SDAG-NEXT: v_mov_b32_e32 v15, v27
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v48, v0
+; GISEL-NEXT: v_mov_b32_e32 v49, v1
+; GISEL-NEXT: v_mov_b32_e32 v50, v2
+; GISEL-NEXT: v_mov_b32_e32 v51, v3
+; GISEL-NEXT: v_mov_b32_e32 v30, v4
+; GISEL-NEXT: v_mov_b32_e32 v31, v5
+; GISEL-NEXT: v_mov_b32_e32 v32, v6
+; GISEL-NEXT: v_mov_b32_e32 v33, v7
+; GISEL-NEXT: v_mov_b32_e32 v34, v8
+; GISEL-NEXT: v_mov_b32_e32 v35, v9
+; GISEL-NEXT: v_mov_b32_e32 v36, v10
+; GISEL-NEXT: v_mov_b32_e32 v37, v11
+; GISEL-NEXT: v_mov_b32_e32 v0, v12
+; GISEL-NEXT: v_mov_b32_e32 v1, v13
+; GISEL-NEXT: v_mov_b32_e32 v2, v14
+; GISEL-NEXT: v_mov_b32_e32 v3, v15
+; GISEL-NEXT: v_mov_b32_e32 v4, v16
+; GISEL-NEXT: v_mov_b32_e32 v5, v17
+; GISEL-NEXT: v_mov_b32_e32 v6, v18
+; GISEL-NEXT: v_mov_b32_e32 v7, v19
+; GISEL-NEXT: v_mov_b32_e32 v8, v20
+; GISEL-NEXT: v_mov_b32_e32 v9, v21
+; GISEL-NEXT: v_mov_b32_e32 v10, v22
+; GISEL-NEXT: v_mov_b32_e32 v11, v23
+; GISEL-NEXT: v_mov_b32_e32 v12, v24
+; GISEL-NEXT: v_mov_b32_e32 v13, v25
+; GISEL-NEXT: v_mov_b32_e32 v14, v26
+; GISEL-NEXT: v_mov_b32_e32 v15, v27
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
ret <16 x float> %result
}
define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
-; GCN-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags1:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_mov_b32_e32 v0, v12
-; GCN-NEXT: v_mov_b32_e32 v1, v13
-; GCN-NEXT: v_mov_b32_e32 v2, v14
-; GCN-NEXT: v_mov_b32_e32 v3, v15
-; GCN-NEXT: v_mov_b32_e32 v4, v16
-; GCN-NEXT: v_mov_b32_e32 v5, v17
-; GCN-NEXT: v_mov_b32_e32 v6, v18
-; GCN-NEXT: v_mov_b32_e32 v7, v19
-; GCN-NEXT: v_mov_b32_e32 v8, v20
-; GCN-NEXT: v_mov_b32_e32 v9, v21
-; GCN-NEXT: v_mov_b32_e32 v10, v22
-; GCN-NEXT: v_mov_b32_e32 v11, v23
-; GCN-NEXT: v_mov_b32_e32 v12, v24
-; GCN-NEXT: v_mov_b32_e32 v13, v25
-; GCN-NEXT: v_mov_b32_e32 v14, v26
-; GCN-NEXT: v_mov_b32_e32 v15, v27
-; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
- ret <16 x float> %result
-}
-
-define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x float> inreg %arg2, i32 inreg %arg3) {
-; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__sgpr:
+; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags1:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v26, s0
-; SDAG-NEXT: v_mov_b32_e32 v27, s1
-; SDAG-NEXT: v_mov_b32_e32 v28, s2
-; SDAG-NEXT: v_mov_b32_e32 v29, s3
-; SDAG-NEXT: v_mov_b32_e32 v16, v10
-; SDAG-NEXT: v_mov_b32_e32 v15, v9
-; SDAG-NEXT: v_mov_b32_e32 v14, v8
-; SDAG-NEXT: v_mov_b32_e32 v13, v7
-; SDAG-NEXT: v_mov_b32_e32 v12, v6
-; SDAG-NEXT: v_mov_b32_e32 v11, v5
-; SDAG-NEXT: v_mov_b32_e32 v10, v4
-; SDAG-NEXT: v_mov_b32_e32 v9, v3
-; SDAG-NEXT: v_mov_b32_e32 v8, v2
-; SDAG-NEXT: v_mov_b32_e32 v7, v1
-; SDAG-NEXT: v_mov_b32_e32 v6, v0
-; SDAG-NEXT: v_mov_b32_e32 v0, s24
-; SDAG-NEXT: v_mov_b32_e32 v1, s25
-; SDAG-NEXT: v_mov_b32_e32 v2, s26
-; SDAG-NEXT: v_mov_b32_e32 v3, s27
-; SDAG-NEXT: v_mov_b32_e32 v4, s28
-; SDAG-NEXT: v_mov_b32_e32 v5, s29
-; SDAG-NEXT: v_mov_b32_e32 v18, s16
-; SDAG-NEXT: v_mov_b32_e32 v19, s17
-; SDAG-NEXT: v_mov_b32_e32 v20, s18
-; SDAG-NEXT: v_mov_b32_e32 v21, s19
-; SDAG-NEXT: v_mov_b32_e32 v22, s20
-; SDAG-NEXT: v_mov_b32_e32 v23, s21
-; SDAG-NEXT: v_mov_b32_e32 v24, s22
-; SDAG-NEXT: v_mov_b32_e32 v25, s23
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[26:29], v[18:25], v16
+; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_mov_b32_e32 v0, v12
+; SDAG-NEXT: v_mov_b32_e32 v1, v13
+; SDAG-NEXT: v_mov_b32_e32 v2, v14
+; SDAG-NEXT: v_mov_b32_e32 v3, v15
+; SDAG-NEXT: v_mov_b32_e32 v4, v16
+; SDAG-NEXT: v_mov_b32_e32 v5, v17
+; SDAG-NEXT: v_mov_b32_e32 v6, v18
+; SDAG-NEXT: v_mov_b32_e32 v7, v19
+; SDAG-NEXT: v_mov_b32_e32 v8, v20
+; SDAG-NEXT: v_mov_b32_e32 v9, v21
+; SDAG-NEXT: v_mov_b32_e32 v10, v22
+; SDAG-NEXT: v_mov_b32_e32 v11, v23
+; SDAG-NEXT: v_mov_b32_e32 v12, v24
+; SDAG-NEXT: v_mov_b32_e32 v13, v25
+; SDAG-NEXT: v_mov_b32_e32 v14, v26
+; SDAG-NEXT: v_mov_b32_e32 v15, v27
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__sgpr:
+; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags1:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[2:3]
-; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[0:1]
-; GISEL-NEXT: v_mov_b32_e32 v18, v0
-; GISEL-NEXT: v_mov_b32_e32 v19, v1
-; GISEL-NEXT: v_mov_b32_e32 v20, v2
-; GISEL-NEXT: v_mov_b32_e32 v21, v3
-; GISEL-NEXT: v_mov_b32_e32 v22, v4
-; GISEL-NEXT: v_mov_b32_e32 v23, v5
-; GISEL-NEXT: v_mov_b32_e32 v24, v6
-; GISEL-NEXT: v_mov_b32_e32 v25, v7
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[16:17]
-; GISEL-NEXT: v_mov_b32_e32 v26, v8
-; GISEL-NEXT: v_mov_b32_e32 v27, v9
-; GISEL-NEXT: v_mov_b32_e32 v12, s24
-; GISEL-NEXT: v_mov_b32_e32 v13, s25
-; GISEL-NEXT: v_mov_b32_e32 v14, s26
-; GISEL-NEXT: v_mov_b32_e32 v15, s27
-; GISEL-NEXT: v_mov_b32_e32 v16, s28
-; GISEL-NEXT: v_mov_b32_e32 v17, s29
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[22:23]
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[28:31], v[0:7], v10
-; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_mov_b32_e32 v48, v0
+; GISEL-NEXT: v_mov_b32_e32 v49, v1
+; GISEL-NEXT: v_mov_b32_e32 v50, v2
+; GISEL-NEXT: v_mov_b32_e32 v51, v3
+; GISEL-NEXT: v_mov_b32_e32 v30, v4
+; GISEL-NEXT: v_mov_b32_e32 v31, v5
+; GISEL-NEXT: v_mov_b32_e32 v32, v6
+; GISEL-NEXT: v_mov_b32_e32 v33, v7
+; GISEL-NEXT: v_mov_b32_e32 v34, v8
+; GISEL-NEXT: v_mov_b32_e32 v35, v9
+; GISEL-NEXT: v_mov_b32_e32 v36, v10
+; GISEL-NEXT: v_mov_b32_e32 v37, v11
; GISEL-NEXT: v_mov_b32_e32 v0, v12
; GISEL-NEXT: v_mov_b32_e32 v1, v13
; GISEL-NEXT: v_mov_b32_e32 v2, v14
@@ -2044,6 +2307,104 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__sgpr(<4 x i32> inreg %arg
; GISEL-NEXT: v_mov_b32_e32 v13, v25
; GISEL-NEXT: v_mov_b32_e32 v14, v26
; GISEL-NEXT: v_mov_b32_e32 v15, v27
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
+ ret <16 x float> %result
+}
+
+define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x float> inreg %arg2, i32 inreg %arg3) {
+; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__sgpr:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_mov_b32_e32 v36, s0
+; SDAG-NEXT: v_mov_b32_e32 v37, s1
+; SDAG-NEXT: v_mov_b32_e32 v38, s2
+; SDAG-NEXT: v_mov_b32_e32 v39, s3
+; SDAG-NEXT: v_mov_b32_e32 v13, s25
+; SDAG-NEXT: v_mov_b32_e32 v14, s26
+; SDAG-NEXT: v_mov_b32_e32 v15, s27
+; SDAG-NEXT: v_mov_b32_e32 v16, s28
+; SDAG-NEXT: v_mov_b32_e32 v17, s29
+; SDAG-NEXT: v_mov_b32_e32 v28, s16
+; SDAG-NEXT: v_mov_b32_e32 v29, s17
+; SDAG-NEXT: v_mov_b32_e32 v30, s18
+; SDAG-NEXT: v_mov_b32_e32 v31, s19
+; SDAG-NEXT: v_mov_b32_e32 v32, s20
+; SDAG-NEXT: v_mov_b32_e32 v33, s21
+; SDAG-NEXT: v_mov_b32_e32 v34, s22
+; SDAG-NEXT: v_mov_b32_e32 v35, s23
+; SDAG-NEXT: v_mov_b32_e32 v12, s24
+; SDAG-NEXT: v_mov_b32_e32 v18, v0
+; SDAG-NEXT: v_mov_b32_e32 v19, v1
+; SDAG-NEXT: v_mov_b32_e32 v20, v2
+; SDAG-NEXT: v_mov_b32_e32 v21, v3
+; SDAG-NEXT: v_mov_b32_e32 v22, v4
+; SDAG-NEXT: v_mov_b32_e32 v23, v5
+; SDAG-NEXT: v_mov_b32_e32 v24, v6
+; SDAG-NEXT: v_mov_b32_e32 v25, v7
+; SDAG-NEXT: v_mov_b32_e32 v26, v8
+; SDAG-NEXT: v_mov_b32_e32 v27, v9
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[36:39], v[28:35], v10
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_mov_b32_e32 v0, v12
+; SDAG-NEXT: v_mov_b32_e32 v1, v13
+; SDAG-NEXT: v_mov_b32_e32 v2, v14
+; SDAG-NEXT: v_mov_b32_e32 v3, v15
+; SDAG-NEXT: v_mov_b32_e32 v4, v16
+; SDAG-NEXT: v_mov_b32_e32 v5, v17
+; SDAG-NEXT: v_mov_b32_e32 v6, v18
+; SDAG-NEXT: v_mov_b32_e32 v7, v19
+; SDAG-NEXT: v_mov_b32_e32 v8, v20
+; SDAG-NEXT: v_mov_b32_e32 v9, v21
+; SDAG-NEXT: v_mov_b32_e32 v10, v22
+; SDAG-NEXT: v_mov_b32_e32 v11, v23
+; SDAG-NEXT: v_mov_b32_e32 v12, v24
+; SDAG-NEXT: v_mov_b32_e32 v13, v25
+; SDAG-NEXT: v_mov_b32_e32 v14, v26
+; SDAG-NEXT: v_mov_b32_e32 v15, v27
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__sgpr:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[2:3]
+; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[0:1]
+; GISEL-NEXT: v_mov_b32_e32 v11, v0
+; GISEL-NEXT: v_mov_b32_e32 v12, v1
+; GISEL-NEXT: v_mov_b32_e32 v13, v2
+; GISEL-NEXT: v_mov_b32_e32 v14, v3
+; GISEL-NEXT: v_mov_b32_e32 v15, v4
+; GISEL-NEXT: v_mov_b32_e32 v16, v5
+; GISEL-NEXT: v_mov_b32_e32 v17, v6
+; GISEL-NEXT: v_mov_b32_e32 v18, v7
+; GISEL-NEXT: v_mov_b32_e32 v19, v8
+; GISEL-NEXT: v_mov_b32_e32 v20, v9
+; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[22:23]
+; GISEL-NEXT: v_mov_b32_e32 v21, v10
+; GISEL-NEXT: v_mov_b32_e32 v0, s24
+; GISEL-NEXT: v_mov_b32_e32 v1, s25
+; GISEL-NEXT: v_mov_b32_e32 v2, s26
+; GISEL-NEXT: v_mov_b32_e32 v3, s27
+; GISEL-NEXT: v_mov_b32_e32 v4, s28
+; GISEL-NEXT: v_mov_b32_e32 v5, s29
+; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17]
+; GISEL-NEXT: v_mov_b32_e32 v6, v11
+; GISEL-NEXT: v_mov_b32_e32 v7, v12
+; GISEL-NEXT: v_mov_b32_e32 v8, v13
+; GISEL-NEXT: v_mov_b32_e32 v9, v14
+; GISEL-NEXT: v_mov_b32_e32 v10, v15
+; GISEL-NEXT: v_mov_b32_e32 v11, v16
+; GISEL-NEXT: v_mov_b32_e32 v12, v17
+; GISEL-NEXT: v_mov_b32_e32 v13, v18
+; GISEL-NEXT: v_mov_b32_e32 v14, v19
+; GISEL-NEXT: v_mov_b32_e32 v15, v20
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[30:33], v[22:29], v21
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <16 x float> %result
@@ -2135,151 +2496,168 @@ bb:
}
define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
-; GCN-LABEL: test_smfmac_f32_32x32x64_bf8_fp8:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_mov_b32_e32 v0, v12
-; GCN-NEXT: v_mov_b32_e32 v1, v13
-; GCN-NEXT: v_mov_b32_e32 v2, v14
-; GCN-NEXT: v_mov_b32_e32 v3, v15
-; GCN-NEXT: v_mov_b32_e32 v4, v16
-; GCN-NEXT: v_mov_b32_e32 v5, v17
-; GCN-NEXT: v_mov_b32_e32 v6, v18
-; GCN-NEXT: v_mov_b32_e32 v7, v19
-; GCN-NEXT: v_mov_b32_e32 v8, v20
-; GCN-NEXT: v_mov_b32_e32 v9, v21
-; GCN-NEXT: v_mov_b32_e32 v10, v22
-; GCN-NEXT: v_mov_b32_e32 v11, v23
-; GCN-NEXT: v_mov_b32_e32 v12, v24
-; GCN-NEXT: v_mov_b32_e32 v13, v25
-; GCN-NEXT: v_mov_b32_e32 v14, v26
-; GCN-NEXT: v_mov_b32_e32 v15, v27
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_mov_b32_e32 v0, v12
+; SDAG-NEXT: v_mov_b32_e32 v1, v13
+; SDAG-NEXT: v_mov_b32_e32 v2, v14
+; SDAG-NEXT: v_mov_b32_e32 v3, v15
+; SDAG-NEXT: v_mov_b32_e32 v4, v16
+; SDAG-NEXT: v_mov_b32_e32 v5, v17
+; SDAG-NEXT: v_mov_b32_e32 v6, v18
+; SDAG-NEXT: v_mov_b32_e32 v7, v19
+; SDAG-NEXT: v_mov_b32_e32 v8, v20
+; SDAG-NEXT: v_mov_b32_e32 v9, v21
+; SDAG-NEXT: v_mov_b32_e32 v10, v22
+; SDAG-NEXT: v_mov_b32_e32 v11, v23
+; SDAG-NEXT: v_mov_b32_e32 v12, v24
+; SDAG-NEXT: v_mov_b32_e32 v13, v25
+; SDAG-NEXT: v_mov_b32_e32 v14, v26
+; SDAG-NEXT: v_mov_b32_e32 v15, v27
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v48, v0
+; GISEL-NEXT: v_mov_b32_e32 v49, v1
+; GISEL-NEXT: v_mov_b32_e32 v50, v2
+; GISEL-NEXT: v_mov_b32_e32 v51, v3
+; GISEL-NEXT: v_mov_b32_e32 v30, v4
+; GISEL-NEXT: v_mov_b32_e32 v31, v5
+; GISEL-NEXT: v_mov_b32_e32 v32, v6
+; GISEL-NEXT: v_mov_b32_e32 v33, v7
+; GISEL-NEXT: v_mov_b32_e32 v34, v8
+; GISEL-NEXT: v_mov_b32_e32 v35, v9
+; GISEL-NEXT: v_mov_b32_e32 v36, v10
+; GISEL-NEXT: v_mov_b32_e32 v37, v11
+; GISEL-NEXT: v_mov_b32_e32 v0, v12
+; GISEL-NEXT: v_mov_b32_e32 v1, v13
+; GISEL-NEXT: v_mov_b32_e32 v2, v14
+; GISEL-NEXT: v_mov_b32_e32 v3, v15
+; GISEL-NEXT: v_mov_b32_e32 v4, v16
+; GISEL-NEXT: v_mov_b32_e32 v5, v17
+; GISEL-NEXT: v_mov_b32_e32 v6, v18
+; GISEL-NEXT: v_mov_b32_e32 v7, v19
+; GISEL-NEXT: v_mov_b32_e32 v8, v20
+; GISEL-NEXT: v_mov_b32_e32 v9, v21
+; GISEL-NEXT: v_mov_b32_e32 v10, v22
+; GISEL-NEXT: v_mov_b32_e32 v11, v23
+; GISEL-NEXT: v_mov_b32_e32 v12, v24
+; GISEL-NEXT: v_mov_b32_e32 v13, v25
+; GISEL-NEXT: v_mov_b32_e32 v14, v26
+; GISEL-NEXT: v_mov_b32_e32 v15, v27
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[48:51], v[30:37], v28
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <16 x float> %result
}
define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
-; GCN-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_mov_b32_e32 v0, v12
-; GCN-NEXT: v_mov_b32_e32 v1, v13
-; GCN-NEXT: v_mov_b32_e32 v2, v14
-; GCN-NEXT: v_mov_b32_e32 v3, v15
-; GCN-NEXT: v_mov_b32_e32 v4, v16
-; GCN-NEXT: v_mov_b32_e32 v5, v17
-; GCN-NEXT: v_mov_b32_e32 v6, v18
-; GCN-NEXT: v_mov_b32_e32 v7, v19
-; GCN-NEXT: v_mov_b32_e32 v8, v20
-; GCN-NEXT: v_mov_b32_e32 v9, v21
-; GCN-NEXT: v_mov_b32_e32 v10, v22
-; GCN-NEXT: v_mov_b32_e32 v11, v23
-; GCN-NEXT: v_mov_b32_e32 v12, v24
-; GCN-NEXT: v_mov_b32_e32 v13, v25
-; GCN-NEXT: v_mov_b32_e32 v14, v26
-; GCN-NEXT: v_mov_b32_e32 v15, v27
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_mov_b32_e32 v0, v12
+; SDAG-NEXT: v_mov_b32_e32 v1, v13
+; SDAG-NEXT: v_mov_b32_e32 v2, v14
+; SDAG-NEXT: v_mov_b32_e32 v3, v15
+; SDAG-NEXT: v_mov_b32_e32 v4, v16
+; SDAG-NEXT: v_mov_b32_e32 v5, v17
+; SDAG-NEXT: v_mov_b32_e32 v6, v18
+; SDAG-NEXT: v_mov_b32_e32 v7, v19
+; SDAG-NEXT: v_mov_b32_e32 v8, v20
+; SDAG-NEXT: v_mov_b32_e32 v9, v21
+; SDAG-NEXT: v_mov_b32_e32 v10, v22
+; SDAG-NEXT: v_mov_b32_e32 v11, v23
+; SDAG-NEXT: v_mov_b32_e32 v12, v24
+; SDAG-NEXT: v_mov_b32_e32 v13, v25
+; SDAG-NEXT: v_mov_b32_e32 v14, v26
+; SDAG-NEXT: v_mov_b32_e32 v15, v27
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v48, v0
+; GISEL-NEXT: v_mov_b32_e32 v49, v1
+; GISEL-NEXT: v_mov_b32_e32 v50, v2
+; GISEL-NEXT: v_mov_b32_e32 v51, v3
+; GISEL-NEXT: v_mov_b32_e32 v30, v4
+; GISEL-NEXT: v_mov_b32_e32 v31, v5
+; GISEL-NEXT: v_mov_b32_e32 v32, v6
+; GISEL-NEXT: v_mov_b32_e32 v33, v7
+; GISEL-NEXT: v_mov_b32_e32 v34, v8
+; GISEL-NEXT: v_mov_b32_e32 v35, v9
+; GISEL-NEXT: v_mov_b32_e32 v36, v10
+; GISEL-NEXT: v_mov_b32_e32 v37, v11
+; GISEL-NEXT: v_mov_b32_e32 v0, v12
+; GISEL-NEXT: v_mov_b32_e32 v1, v13
+; GISEL-NEXT: v_mov_b32_e32 v2, v14
+; GISEL-NEXT: v_mov_b32_e32 v3, v15
+; GISEL-NEXT: v_mov_b32_e32 v4, v16
+; GISEL-NEXT: v_mov_b32_e32 v5, v17
+; GISEL-NEXT: v_mov_b32_e32 v6, v18
+; GISEL-NEXT: v_mov_b32_e32 v7, v19
+; GISEL-NEXT: v_mov_b32_e32 v8, v20
+; GISEL-NEXT: v_mov_b32_e32 v9, v21
+; GISEL-NEXT: v_mov_b32_e32 v10, v22
+; GISEL-NEXT: v_mov_b32_e32 v11, v23
+; GISEL-NEXT: v_mov_b32_e32 v12, v24
+; GISEL-NEXT: v_mov_b32_e32 v13, v25
+; GISEL-NEXT: v_mov_b32_e32 v14, v26
+; GISEL-NEXT: v_mov_b32_e32 v15, v27
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
ret <16 x float> %result
}
define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
-; GCN-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags1:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_mov_b32_e32 v0, v12
-; GCN-NEXT: v_mov_b32_e32 v1, v13
-; GCN-NEXT: v_mov_b32_e32 v2, v14
-; GCN-NEXT: v_mov_b32_e32 v3, v15
-; GCN-NEXT: v_mov_b32_e32 v4, v16
-; GCN-NEXT: v_mov_b32_e32 v5, v17
-; GCN-NEXT: v_mov_b32_e32 v6, v18
-; GCN-NEXT: v_mov_b32_e32 v7, v19
-; GCN-NEXT: v_mov_b32_e32 v8, v20
-; GCN-NEXT: v_mov_b32_e32 v9, v21
-; GCN-NEXT: v_mov_b32_e32 v10, v22
-; GCN-NEXT: v_mov_b32_e32 v11, v23
-; GCN-NEXT: v_mov_b32_e32 v12, v24
-; GCN-NEXT: v_mov_b32_e32 v13, v25
-; GCN-NEXT: v_mov_b32_e32 v14, v26
-; GCN-NEXT: v_mov_b32_e32 v15, v27
-; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
- ret <16 x float> %result
-}
-
-define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x float> inreg %arg2, i32 inreg %arg3) {
-; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__sgpr:
+; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags1:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v26, s0
-; SDAG-NEXT: v_mov_b32_e32 v27, s1
-; SDAG-NEXT: v_mov_b32_e32 v28, s2
-; SDAG-NEXT: v_mov_b32_e32 v29, s3
-; SDAG-NEXT: v_mov_b32_e32 v16, v10
-; SDAG-NEXT: v_mov_b32_e32 v15, v9
-; SDAG-NEXT: v_mov_b32_e32 v14, v8
-; SDAG-NEXT: v_mov_b32_e32 v13, v7
-; SDAG-NEXT: v_mov_b32_e32 v12, v6
-; SDAG-NEXT: v_mov_b32_e32 v11, v5
-; SDAG-NEXT: v_mov_b32_e32 v10, v4
-; SDAG-NEXT: v_mov_b32_e32 v9, v3
-; SDAG-NEXT: v_mov_b32_e32 v8, v2
-; SDAG-NEXT: v_mov_b32_e32 v7, v1
-; SDAG-NEXT: v_mov_b32_e32 v6, v0
-; SDAG-NEXT: v_mov_b32_e32 v0, s24
-; SDAG-NEXT: v_mov_b32_e32 v1, s25
-; SDAG-NEXT: v_mov_b32_e32 v2, s26
-; SDAG-NEXT: v_mov_b32_e32 v3, s27
-; SDAG-NEXT: v_mov_b32_e32 v4, s28
-; SDAG-NEXT: v_mov_b32_e32 v5, s29
-; SDAG-NEXT: v_mov_b32_e32 v18, s16
-; SDAG-NEXT: v_mov_b32_e32 v19, s17
-; SDAG-NEXT: v_mov_b32_e32 v20, s18
-; SDAG-NEXT: v_mov_b32_e32 v21, s19
-; SDAG-NEXT: v_mov_b32_e32 v22, s20
-; SDAG-NEXT: v_mov_b32_e32 v23, s21
-; SDAG-NEXT: v_mov_b32_e32 v24, s22
-; SDAG-NEXT: v_mov_b32_e32 v25, s23
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[26:29], v[18:25], v16
+; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_mov_b32_e32 v0, v12
+; SDAG-NEXT: v_mov_b32_e32 v1, v13
+; SDAG-NEXT: v_mov_b32_e32 v2, v14
+; SDAG-NEXT: v_mov_b32_e32 v3, v15
+; SDAG-NEXT: v_mov_b32_e32 v4, v16
+; SDAG-NEXT: v_mov_b32_e32 v5, v17
+; SDAG-NEXT: v_mov_b32_e32 v6, v18
+; SDAG-NEXT: v_mov_b32_e32 v7, v19
+; SDAG-NEXT: v_mov_b32_e32 v8, v20
+; SDAG-NEXT: v_mov_b32_e32 v9, v21
+; SDAG-NEXT: v_mov_b32_e32 v10, v22
+; SDAG-NEXT: v_mov_b32_e32 v11, v23
+; SDAG-NEXT: v_mov_b32_e32 v12, v24
+; SDAG-NEXT: v_mov_b32_e32 v13, v25
+; SDAG-NEXT: v_mov_b32_e32 v14, v26
+; SDAG-NEXT: v_mov_b32_e32 v15, v27
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__sgpr:
+; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags1:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[2:3]
-; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[0:1]
-; GISEL-NEXT: v_mov_b32_e32 v18, v0
-; GISEL-NEXT: v_mov_b32_e32 v19, v1
-; GISEL-NEXT: v_mov_b32_e32 v20, v2
-; GISEL-NEXT: v_mov_b32_e32 v21, v3
-; GISEL-NEXT: v_mov_b32_e32 v22, v4
-; GISEL-NEXT: v_mov_b32_e32 v23, v5
-; GISEL-NEXT: v_mov_b32_e32 v24, v6
-; GISEL-NEXT: v_mov_b32_e32 v25, v7
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[16:17]
-; GISEL-NEXT: v_mov_b32_e32 v26, v8
-; GISEL-NEXT: v_mov_b32_e32 v27, v9
-; GISEL-NEXT: v_mov_b32_e32 v12, s24
-; GISEL-NEXT: v_mov_b32_e32 v13, s25
-; GISEL-NEXT: v_mov_b32_e32 v14, s26
-; GISEL-NEXT: v_mov_b32_e32 v15, s27
-; GISEL-NEXT: v_mov_b32_e32 v16, s28
-; GISEL-NEXT: v_mov_b32_e32 v17, s29
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[22:23]
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[28:31], v[0:7], v10
-; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_mov_b32_e32 v48, v0
+; GISEL-NEXT: v_mov_b32_e32 v49, v1
+; GISEL-NEXT: v_mov_b32_e32 v50, v2
+; GISEL-NEXT: v_mov_b32_e32 v51, v3
+; GISEL-NEXT: v_mov_b32_e32 v30, v4
+; GISEL-NEXT: v_mov_b32_e32 v31, v5
+; GISEL-NEXT: v_mov_b32_e32 v32, v6
+; GISEL-NEXT: v_mov_b32_e32 v33, v7
+; GISEL-NEXT: v_mov_b32_e32 v34, v8
+; GISEL-NEXT: v_mov_b32_e32 v35, v9
+; GISEL-NEXT: v_mov_b32_e32 v36, v10
+; GISEL-NEXT: v_mov_b32_e32 v37, v11
; GISEL-NEXT: v_mov_b32_e32 v0, v12
; GISEL-NEXT: v_mov_b32_e32 v1, v13
; GISEL-NEXT: v_mov_b32_e32 v2, v14
@@ -2296,6 +2674,104 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__sgpr(<4 x i32> inreg %arg
; GISEL-NEXT: v_mov_b32_e32 v13, v25
; GISEL-NEXT: v_mov_b32_e32 v14, v26
; GISEL-NEXT: v_mov_b32_e32 v15, v27
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
+ ret <16 x float> %result
+}
+
+define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x float> inreg %arg2, i32 inreg %arg3) {
+; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__sgpr:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_mov_b32_e32 v36, s0
+; SDAG-NEXT: v_mov_b32_e32 v37, s1
+; SDAG-NEXT: v_mov_b32_e32 v38, s2
+; SDAG-NEXT: v_mov_b32_e32 v39, s3
+; SDAG-NEXT: v_mov_b32_e32 v13, s25
+; SDAG-NEXT: v_mov_b32_e32 v14, s26
+; SDAG-NEXT: v_mov_b32_e32 v15, s27
+; SDAG-NEXT: v_mov_b32_e32 v16, s28
+; SDAG-NEXT: v_mov_b32_e32 v17, s29
+; SDAG-NEXT: v_mov_b32_e32 v28, s16
+; SDAG-NEXT: v_mov_b32_e32 v29, s17
+; SDAG-NEXT: v_mov_b32_e32 v30, s18
+; SDAG-NEXT: v_mov_b32_e32 v31, s19
+; SDAG-NEXT: v_mov_b32_e32 v32, s20
+; SDAG-NEXT: v_mov_b32_e32 v33, s21
+; SDAG-NEXT: v_mov_b32_e32 v34, s22
+; SDAG-NEXT: v_mov_b32_e32 v35, s23
+; SDAG-NEXT: v_mov_b32_e32 v12, s24
+; SDAG-NEXT: v_mov_b32_e32 v18, v0
+; SDAG-NEXT: v_mov_b32_e32 v19, v1
+; SDAG-NEXT: v_mov_b32_e32 v20, v2
+; SDAG-NEXT: v_mov_b32_e32 v21, v3
+; SDAG-NEXT: v_mov_b32_e32 v22, v4
+; SDAG-NEXT: v_mov_b32_e32 v23, v5
+; SDAG-NEXT: v_mov_b32_e32 v24, v6
+; SDAG-NEXT: v_mov_b32_e32 v25, v7
+; SDAG-NEXT: v_mov_b32_e32 v26, v8
+; SDAG-NEXT: v_mov_b32_e32 v27, v9
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[36:39], v[28:35], v10
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_mov_b32_e32 v0, v12
+; SDAG-NEXT: v_mov_b32_e32 v1, v13
+; SDAG-NEXT: v_mov_b32_e32 v2, v14
+; SDAG-NEXT: v_mov_b32_e32 v3, v15
+; SDAG-NEXT: v_mov_b32_e32 v4, v16
+; SDAG-NEXT: v_mov_b32_e32 v5, v17
+; SDAG-NEXT: v_mov_b32_e32 v6, v18
+; SDAG-NEXT: v_mov_b32_e32 v7, v19
+; SDAG-NEXT: v_mov_b32_e32 v8, v20
+; SDAG-NEXT: v_mov_b32_e32 v9, v21
+; SDAG-NEXT: v_mov_b32_e32 v10, v22
+; SDAG-NEXT: v_mov_b32_e32 v11, v23
+; SDAG-NEXT: v_mov_b32_e32 v12, v24
+; SDAG-NEXT: v_mov_b32_e32 v13, v25
+; SDAG-NEXT: v_mov_b32_e32 v14, v26
+; SDAG-NEXT: v_mov_b32_e32 v15, v27
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__sgpr:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[2:3]
+; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[0:1]
+; GISEL-NEXT: v_mov_b32_e32 v11, v0
+; GISEL-NEXT: v_mov_b32_e32 v12, v1
+; GISEL-NEXT: v_mov_b32_e32 v13, v2
+; GISEL-NEXT: v_mov_b32_e32 v14, v3
+; GISEL-NEXT: v_mov_b32_e32 v15, v4
+; GISEL-NEXT: v_mov_b32_e32 v16, v5
+; GISEL-NEXT: v_mov_b32_e32 v17, v6
+; GISEL-NEXT: v_mov_b32_e32 v18, v7
+; GISEL-NEXT: v_mov_b32_e32 v19, v8
+; GISEL-NEXT: v_mov_b32_e32 v20, v9
+; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[22:23]
+; GISEL-NEXT: v_mov_b32_e32 v21, v10
+; GISEL-NEXT: v_mov_b32_e32 v0, s24
+; GISEL-NEXT: v_mov_b32_e32 v1, s25
+; GISEL-NEXT: v_mov_b32_e32 v2, s26
+; GISEL-NEXT: v_mov_b32_e32 v3, s27
+; GISEL-NEXT: v_mov_b32_e32 v4, s28
+; GISEL-NEXT: v_mov_b32_e32 v5, s29
+; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17]
+; GISEL-NEXT: v_mov_b32_e32 v6, v11
+; GISEL-NEXT: v_mov_b32_e32 v7, v12
+; GISEL-NEXT: v_mov_b32_e32 v8, v13
+; GISEL-NEXT: v_mov_b32_e32 v9, v14
+; GISEL-NEXT: v_mov_b32_e32 v10, v15
+; GISEL-NEXT: v_mov_b32_e32 v11, v16
+; GISEL-NEXT: v_mov_b32_e32 v12, v17
+; GISEL-NEXT: v_mov_b32_e32 v13, v18
+; GISEL-NEXT: v_mov_b32_e32 v14, v19
+; GISEL-NEXT: v_mov_b32_e32 v15, v20
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[30:33], v[22:29], v21
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <16 x float> %result
@@ -2387,151 +2863,168 @@ bb:
}
define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
-; GCN-LABEL: test_smfmac_f32_32x32x64_fp8_bf8:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_mov_b32_e32 v0, v12
-; GCN-NEXT: v_mov_b32_e32 v1, v13
-; GCN-NEXT: v_mov_b32_e32 v2, v14
-; GCN-NEXT: v_mov_b32_e32 v3, v15
-; GCN-NEXT: v_mov_b32_e32 v4, v16
-; GCN-NEXT: v_mov_b32_e32 v5, v17
-; GCN-NEXT: v_mov_b32_e32 v6, v18
-; GCN-NEXT: v_mov_b32_e32 v7, v19
-; GCN-NEXT: v_mov_b32_e32 v8, v20
-; GCN-NEXT: v_mov_b32_e32 v9, v21
-; GCN-NEXT: v_mov_b32_e32 v10, v22
-; GCN-NEXT: v_mov_b32_e32 v11, v23
-; GCN-NEXT: v_mov_b32_e32 v12, v24
-; GCN-NEXT: v_mov_b32_e32 v13, v25
-; GCN-NEXT: v_mov_b32_e32 v14, v26
-; GCN-NEXT: v_mov_b32_e32 v15, v27
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_mov_b32_e32 v0, v12
+; SDAG-NEXT: v_mov_b32_e32 v1, v13
+; SDAG-NEXT: v_mov_b32_e32 v2, v14
+; SDAG-NEXT: v_mov_b32_e32 v3, v15
+; SDAG-NEXT: v_mov_b32_e32 v4, v16
+; SDAG-NEXT: v_mov_b32_e32 v5, v17
+; SDAG-NEXT: v_mov_b32_e32 v6, v18
+; SDAG-NEXT: v_mov_b32_e32 v7, v19
+; SDAG-NEXT: v_mov_b32_e32 v8, v20
+; SDAG-NEXT: v_mov_b32_e32 v9, v21
+; SDAG-NEXT: v_mov_b32_e32 v10, v22
+; SDAG-NEXT: v_mov_b32_e32 v11, v23
+; SDAG-NEXT: v_mov_b32_e32 v12, v24
+; SDAG-NEXT: v_mov_b32_e32 v13, v25
+; SDAG-NEXT: v_mov_b32_e32 v14, v26
+; SDAG-NEXT: v_mov_b32_e32 v15, v27
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v48, v0
+; GISEL-NEXT: v_mov_b32_e32 v49, v1
+; GISEL-NEXT: v_mov_b32_e32 v50, v2
+; GISEL-NEXT: v_mov_b32_e32 v51, v3
+; GISEL-NEXT: v_mov_b32_e32 v30, v4
+; GISEL-NEXT: v_mov_b32_e32 v31, v5
+; GISEL-NEXT: v_mov_b32_e32 v32, v6
+; GISEL-NEXT: v_mov_b32_e32 v33, v7
+; GISEL-NEXT: v_mov_b32_e32 v34, v8
+; GISEL-NEXT: v_mov_b32_e32 v35, v9
+; GISEL-NEXT: v_mov_b32_e32 v36, v10
+; GISEL-NEXT: v_mov_b32_e32 v37, v11
+; GISEL-NEXT: v_mov_b32_e32 v0, v12
+; GISEL-NEXT: v_mov_b32_e32 v1, v13
+; GISEL-NEXT: v_mov_b32_e32 v2, v14
+; GISEL-NEXT: v_mov_b32_e32 v3, v15
+; GISEL-NEXT: v_mov_b32_e32 v4, v16
+; GISEL-NEXT: v_mov_b32_e32 v5, v17
+; GISEL-NEXT: v_mov_b32_e32 v6, v18
+; GISEL-NEXT: v_mov_b32_e32 v7, v19
+; GISEL-NEXT: v_mov_b32_e32 v8, v20
+; GISEL-NEXT: v_mov_b32_e32 v9, v21
+; GISEL-NEXT: v_mov_b32_e32 v10, v22
+; GISEL-NEXT: v_mov_b32_e32 v11, v23
+; GISEL-NEXT: v_mov_b32_e32 v12, v24
+; GISEL-NEXT: v_mov_b32_e32 v13, v25
+; GISEL-NEXT: v_mov_b32_e32 v14, v26
+; GISEL-NEXT: v_mov_b32_e32 v15, v27
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[48:51], v[30:37], v28
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <16 x float> %result
}
define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
-; GCN-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_mov_b32_e32 v0, v12
-; GCN-NEXT: v_mov_b32_e32 v1, v13
-; GCN-NEXT: v_mov_b32_e32 v2, v14
-; GCN-NEXT: v_mov_b32_e32 v3, v15
-; GCN-NEXT: v_mov_b32_e32 v4, v16
-; GCN-NEXT: v_mov_b32_e32 v5, v17
-; GCN-NEXT: v_mov_b32_e32 v6, v18
-; GCN-NEXT: v_mov_b32_e32 v7, v19
-; GCN-NEXT: v_mov_b32_e32 v8, v20
-; GCN-NEXT: v_mov_b32_e32 v9, v21
-; GCN-NEXT: v_mov_b32_e32 v10, v22
-; GCN-NEXT: v_mov_b32_e32 v11, v23
-; GCN-NEXT: v_mov_b32_e32 v12, v24
-; GCN-NEXT: v_mov_b32_e32 v13, v25
-; GCN-NEXT: v_mov_b32_e32 v14, v26
-; GCN-NEXT: v_mov_b32_e32 v15, v27
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_mov_b32_e32 v0, v12
+; SDAG-NEXT: v_mov_b32_e32 v1, v13
+; SDAG-NEXT: v_mov_b32_e32 v2, v14
+; SDAG-NEXT: v_mov_b32_e32 v3, v15
+; SDAG-NEXT: v_mov_b32_e32 v4, v16
+; SDAG-NEXT: v_mov_b32_e32 v5, v17
+; SDAG-NEXT: v_mov_b32_e32 v6, v18
+; SDAG-NEXT: v_mov_b32_e32 v7, v19
+; SDAG-NEXT: v_mov_b32_e32 v8, v20
+; SDAG-NEXT: v_mov_b32_e32 v9, v21
+; SDAG-NEXT: v_mov_b32_e32 v10, v22
+; SDAG-NEXT: v_mov_b32_e32 v11, v23
+; SDAG-NEXT: v_mov_b32_e32 v12, v24
+; SDAG-NEXT: v_mov_b32_e32 v13, v25
+; SDAG-NEXT: v_mov_b32_e32 v14, v26
+; SDAG-NEXT: v_mov_b32_e32 v15, v27
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v48, v0
+; GISEL-NEXT: v_mov_b32_e32 v49, v1
+; GISEL-NEXT: v_mov_b32_e32 v50, v2
+; GISEL-NEXT: v_mov_b32_e32 v51, v3
+; GISEL-NEXT: v_mov_b32_e32 v30, v4
+; GISEL-NEXT: v_mov_b32_e32 v31, v5
+; GISEL-NEXT: v_mov_b32_e32 v32, v6
+; GISEL-NEXT: v_mov_b32_e32 v33, v7
+; GISEL-NEXT: v_mov_b32_e32 v34, v8
+; GISEL-NEXT: v_mov_b32_e32 v35, v9
+; GISEL-NEXT: v_mov_b32_e32 v36, v10
+; GISEL-NEXT: v_mov_b32_e32 v37, v11
+; GISEL-NEXT: v_mov_b32_e32 v0, v12
+; GISEL-NEXT: v_mov_b32_e32 v1, v13
+; GISEL-NEXT: v_mov_b32_e32 v2, v14
+; GISEL-NEXT: v_mov_b32_e32 v3, v15
+; GISEL-NEXT: v_mov_b32_e32 v4, v16
+; GISEL-NEXT: v_mov_b32_e32 v5, v17
+; GISEL-NEXT: v_mov_b32_e32 v6, v18
+; GISEL-NEXT: v_mov_b32_e32 v7, v19
+; GISEL-NEXT: v_mov_b32_e32 v8, v20
+; GISEL-NEXT: v_mov_b32_e32 v9, v21
+; GISEL-NEXT: v_mov_b32_e32 v10, v22
+; GISEL-NEXT: v_mov_b32_e32 v11, v23
+; GISEL-NEXT: v_mov_b32_e32 v12, v24
+; GISEL-NEXT: v_mov_b32_e32 v13, v25
+; GISEL-NEXT: v_mov_b32_e32 v14, v26
+; GISEL-NEXT: v_mov_b32_e32 v15, v27
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
ret <16 x float> %result
}
define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
-; GCN-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags1:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_mov_b32_e32 v0, v12
-; GCN-NEXT: v_mov_b32_e32 v1, v13
-; GCN-NEXT: v_mov_b32_e32 v2, v14
-; GCN-NEXT: v_mov_b32_e32 v3, v15
-; GCN-NEXT: v_mov_b32_e32 v4, v16
-; GCN-NEXT: v_mov_b32_e32 v5, v17
-; GCN-NEXT: v_mov_b32_e32 v6, v18
-; GCN-NEXT: v_mov_b32_e32 v7, v19
-; GCN-NEXT: v_mov_b32_e32 v8, v20
-; GCN-NEXT: v_mov_b32_e32 v9, v21
-; GCN-NEXT: v_mov_b32_e32 v10, v22
-; GCN-NEXT: v_mov_b32_e32 v11, v23
-; GCN-NEXT: v_mov_b32_e32 v12, v24
-; GCN-NEXT: v_mov_b32_e32 v13, v25
-; GCN-NEXT: v_mov_b32_e32 v14, v26
-; GCN-NEXT: v_mov_b32_e32 v15, v27
-; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
- ret <16 x float> %result
-}
-
-define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x float> inreg %arg2, i32 inreg %arg3) {
-; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__sgpr:
+; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags1:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v26, s0
-; SDAG-NEXT: v_mov_b32_e32 v27, s1
-; SDAG-NEXT: v_mov_b32_e32 v28, s2
-; SDAG-NEXT: v_mov_b32_e32 v29, s3
-; SDAG-NEXT: v_mov_b32_e32 v16, v10
-; SDAG-NEXT: v_mov_b32_e32 v15, v9
-; SDAG-NEXT: v_mov_b32_e32 v14, v8
-; SDAG-NEXT: v_mov_b32_e32 v13, v7
-; SDAG-NEXT: v_mov_b32_e32 v12, v6
-; SDAG-NEXT: v_mov_b32_e32 v11, v5
-; SDAG-NEXT: v_mov_b32_e32 v10, v4
-; SDAG-NEXT: v_mov_b32_e32 v9, v3
-; SDAG-NEXT: v_mov_b32_e32 v8, v2
-; SDAG-NEXT: v_mov_b32_e32 v7, v1
-; SDAG-NEXT: v_mov_b32_e32 v6, v0
-; SDAG-NEXT: v_mov_b32_e32 v0, s24
-; SDAG-NEXT: v_mov_b32_e32 v1, s25
-; SDAG-NEXT: v_mov_b32_e32 v2, s26
-; SDAG-NEXT: v_mov_b32_e32 v3, s27
-; SDAG-NEXT: v_mov_b32_e32 v4, s28
-; SDAG-NEXT: v_mov_b32_e32 v5, s29
-; SDAG-NEXT: v_mov_b32_e32 v18, s16
-; SDAG-NEXT: v_mov_b32_e32 v19, s17
-; SDAG-NEXT: v_mov_b32_e32 v20, s18
-; SDAG-NEXT: v_mov_b32_e32 v21, s19
-; SDAG-NEXT: v_mov_b32_e32 v22, s20
-; SDAG-NEXT: v_mov_b32_e32 v23, s21
-; SDAG-NEXT: v_mov_b32_e32 v24, s22
-; SDAG-NEXT: v_mov_b32_e32 v25, s23
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[26:29], v[18:25], v16
+; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_mov_b32_e32 v0, v12
+; SDAG-NEXT: v_mov_b32_e32 v1, v13
+; SDAG-NEXT: v_mov_b32_e32 v2, v14
+; SDAG-NEXT: v_mov_b32_e32 v3, v15
+; SDAG-NEXT: v_mov_b32_e32 v4, v16
+; SDAG-NEXT: v_mov_b32_e32 v5, v17
+; SDAG-NEXT: v_mov_b32_e32 v6, v18
+; SDAG-NEXT: v_mov_b32_e32 v7, v19
+; SDAG-NEXT: v_mov_b32_e32 v8, v20
+; SDAG-NEXT: v_mov_b32_e32 v9, v21
+; SDAG-NEXT: v_mov_b32_e32 v10, v22
+; SDAG-NEXT: v_mov_b32_e32 v11, v23
+; SDAG-NEXT: v_mov_b32_e32 v12, v24
+; SDAG-NEXT: v_mov_b32_e32 v13, v25
+; SDAG-NEXT: v_mov_b32_e32 v14, v26
+; SDAG-NEXT: v_mov_b32_e32 v15, v27
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__sgpr:
+; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags1:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[2:3]
-; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[0:1]
-; GISEL-NEXT: v_mov_b32_e32 v18, v0
-; GISEL-NEXT: v_mov_b32_e32 v19, v1
-; GISEL-NEXT: v_mov_b32_e32 v20, v2
-; GISEL-NEXT: v_mov_b32_e32 v21, v3
-; GISEL-NEXT: v_mov_b32_e32 v22, v4
-; GISEL-NEXT: v_mov_b32_e32 v23, v5
-; GISEL-NEXT: v_mov_b32_e32 v24, v6
-; GISEL-NEXT: v_mov_b32_e32 v25, v7
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[16:17]
-; GISEL-NEXT: v_mov_b32_e32 v26, v8
-; GISEL-NEXT: v_mov_b32_e32 v27, v9
-; GISEL-NEXT: v_mov_b32_e32 v12, s24
-; GISEL-NEXT: v_mov_b32_e32 v13, s25
-; GISEL-NEXT: v_mov_b32_e32 v14, s26
-; GISEL-NEXT: v_mov_b32_e32 v15, s27
-; GISEL-NEXT: v_mov_b32_e32 v16, s28
-; GISEL-NEXT: v_mov_b32_e32 v17, s29
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[22:23]
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[28:31], v[0:7], v10
-; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_mov_b32_e32 v48, v0
+; GISEL-NEXT: v_mov_b32_e32 v49, v1
+; GISEL-NEXT: v_mov_b32_e32 v50, v2
+; GISEL-NEXT: v_mov_b32_e32 v51, v3
+; GISEL-NEXT: v_mov_b32_e32 v30, v4
+; GISEL-NEXT: v_mov_b32_e32 v31, v5
+; GISEL-NEXT: v_mov_b32_e32 v32, v6
+; GISEL-NEXT: v_mov_b32_e32 v33, v7
+; GISEL-NEXT: v_mov_b32_e32 v34, v8
+; GISEL-NEXT: v_mov_b32_e32 v35, v9
+; GISEL-NEXT: v_mov_b32_e32 v36, v10
+; GISEL-NEXT: v_mov_b32_e32 v37, v11
; GISEL-NEXT: v_mov_b32_e32 v0, v12
; GISEL-NEXT: v_mov_b32_e32 v1, v13
; GISEL-NEXT: v_mov_b32_e32 v2, v14
@@ -2548,6 +3041,104 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__sgpr(<4 x i32> inreg %arg
; GISEL-NEXT: v_mov_b32_e32 v13, v25
; GISEL-NEXT: v_mov_b32_e32 v14, v26
; GISEL-NEXT: v_mov_b32_e32 v15, v27
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
+ ret <16 x float> %result
+}
+
+define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x float> inreg %arg2, i32 inreg %arg3) {
+; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__sgpr:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_mov_b32_e32 v36, s0
+; SDAG-NEXT: v_mov_b32_e32 v37, s1
+; SDAG-NEXT: v_mov_b32_e32 v38, s2
+; SDAG-NEXT: v_mov_b32_e32 v39, s3
+; SDAG-NEXT: v_mov_b32_e32 v13, s25
+; SDAG-NEXT: v_mov_b32_e32 v14, s26
+; SDAG-NEXT: v_mov_b32_e32 v15, s27
+; SDAG-NEXT: v_mov_b32_e32 v16, s28
+; SDAG-NEXT: v_mov_b32_e32 v17, s29
+; SDAG-NEXT: v_mov_b32_e32 v28, s16
+; SDAG-NEXT: v_mov_b32_e32 v29, s17
+; SDAG-NEXT: v_mov_b32_e32 v30, s18
+; SDAG-NEXT: v_mov_b32_e32 v31, s19
+; SDAG-NEXT: v_mov_b32_e32 v32, s20
+; SDAG-NEXT: v_mov_b32_e32 v33, s21
+; SDAG-NEXT: v_mov_b32_e32 v34, s22
+; SDAG-NEXT: v_mov_b32_e32 v35, s23
+; SDAG-NEXT: v_mov_b32_e32 v12, s24
+; SDAG-NEXT: v_mov_b32_e32 v18, v0
+; SDAG-NEXT: v_mov_b32_e32 v19, v1
+; SDAG-NEXT: v_mov_b32_e32 v20, v2
+; SDAG-NEXT: v_mov_b32_e32 v21, v3
+; SDAG-NEXT: v_mov_b32_e32 v22, v4
+; SDAG-NEXT: v_mov_b32_e32 v23, v5
+; SDAG-NEXT: v_mov_b32_e32 v24, v6
+; SDAG-NEXT: v_mov_b32_e32 v25, v7
+; SDAG-NEXT: v_mov_b32_e32 v26, v8
+; SDAG-NEXT: v_mov_b32_e32 v27, v9
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[36:39], v[28:35], v10
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_mov_b32_e32 v0, v12
+; SDAG-NEXT: v_mov_b32_e32 v1, v13
+; SDAG-NEXT: v_mov_b32_e32 v2, v14
+; SDAG-NEXT: v_mov_b32_e32 v3, v15
+; SDAG-NEXT: v_mov_b32_e32 v4, v16
+; SDAG-NEXT: v_mov_b32_e32 v5, v17
+; SDAG-NEXT: v_mov_b32_e32 v6, v18
+; SDAG-NEXT: v_mov_b32_e32 v7, v19
+; SDAG-NEXT: v_mov_b32_e32 v8, v20
+; SDAG-NEXT: v_mov_b32_e32 v9, v21
+; SDAG-NEXT: v_mov_b32_e32 v10, v22
+; SDAG-NEXT: v_mov_b32_e32 v11, v23
+; SDAG-NEXT: v_mov_b32_e32 v12, v24
+; SDAG-NEXT: v_mov_b32_e32 v13, v25
+; SDAG-NEXT: v_mov_b32_e32 v14, v26
+; SDAG-NEXT: v_mov_b32_e32 v15, v27
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__sgpr:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[2:3]
+; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[0:1]
+; GISEL-NEXT: v_mov_b32_e32 v11, v0
+; GISEL-NEXT: v_mov_b32_e32 v12, v1
+; GISEL-NEXT: v_mov_b32_e32 v13, v2
+; GISEL-NEXT: v_mov_b32_e32 v14, v3
+; GISEL-NEXT: v_mov_b32_e32 v15, v4
+; GISEL-NEXT: v_mov_b32_e32 v16, v5
+; GISEL-NEXT: v_mov_b32_e32 v17, v6
+; GISEL-NEXT: v_mov_b32_e32 v18, v7
+; GISEL-NEXT: v_mov_b32_e32 v19, v8
+; GISEL-NEXT: v_mov_b32_e32 v20, v9
+; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[22:23]
+; GISEL-NEXT: v_mov_b32_e32 v21, v10
+; GISEL-NEXT: v_mov_b32_e32 v0, s24
+; GISEL-NEXT: v_mov_b32_e32 v1, s25
+; GISEL-NEXT: v_mov_b32_e32 v2, s26
+; GISEL-NEXT: v_mov_b32_e32 v3, s27
+; GISEL-NEXT: v_mov_b32_e32 v4, s28
+; GISEL-NEXT: v_mov_b32_e32 v5, s29
+; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17]
+; GISEL-NEXT: v_mov_b32_e32 v6, v11
+; GISEL-NEXT: v_mov_b32_e32 v7, v12
+; GISEL-NEXT: v_mov_b32_e32 v8, v13
+; GISEL-NEXT: v_mov_b32_e32 v9, v14
+; GISEL-NEXT: v_mov_b32_e32 v10, v15
+; GISEL-NEXT: v_mov_b32_e32 v11, v16
+; GISEL-NEXT: v_mov_b32_e32 v12, v17
+; GISEL-NEXT: v_mov_b32_e32 v13, v18
+; GISEL-NEXT: v_mov_b32_e32 v14, v19
+; GISEL-NEXT: v_mov_b32_e32 v15, v20
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[30:33], v[22:29], v21
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <16 x float> %result
@@ -2639,151 +3230,168 @@ bb:
}
define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
-; GCN-LABEL: test_smfmac_f32_32x32x64_fp8_fp8:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_mov_b32_e32 v0, v12
-; GCN-NEXT: v_mov_b32_e32 v1, v13
-; GCN-NEXT: v_mov_b32_e32 v2, v14
-; GCN-NEXT: v_mov_b32_e32 v3, v15
-; GCN-NEXT: v_mov_b32_e32 v4, v16
-; GCN-NEXT: v_mov_b32_e32 v5, v17
-; GCN-NEXT: v_mov_b32_e32 v6, v18
-; GCN-NEXT: v_mov_b32_e32 v7, v19
-; GCN-NEXT: v_mov_b32_e32 v8, v20
-; GCN-NEXT: v_mov_b32_e32 v9, v21
-; GCN-NEXT: v_mov_b32_e32 v10, v22
-; GCN-NEXT: v_mov_b32_e32 v11, v23
-; GCN-NEXT: v_mov_b32_e32 v12, v24
-; GCN-NEXT: v_mov_b32_e32 v13, v25
-; GCN-NEXT: v_mov_b32_e32 v14, v26
-; GCN-NEXT: v_mov_b32_e32 v15, v27
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_mov_b32_e32 v0, v12
+; SDAG-NEXT: v_mov_b32_e32 v1, v13
+; SDAG-NEXT: v_mov_b32_e32 v2, v14
+; SDAG-NEXT: v_mov_b32_e32 v3, v15
+; SDAG-NEXT: v_mov_b32_e32 v4, v16
+; SDAG-NEXT: v_mov_b32_e32 v5, v17
+; SDAG-NEXT: v_mov_b32_e32 v6, v18
+; SDAG-NEXT: v_mov_b32_e32 v7, v19
+; SDAG-NEXT: v_mov_b32_e32 v8, v20
+; SDAG-NEXT: v_mov_b32_e32 v9, v21
+; SDAG-NEXT: v_mov_b32_e32 v10, v22
+; SDAG-NEXT: v_mov_b32_e32 v11, v23
+; SDAG-NEXT: v_mov_b32_e32 v12, v24
+; SDAG-NEXT: v_mov_b32_e32 v13, v25
+; SDAG-NEXT: v_mov_b32_e32 v14, v26
+; SDAG-NEXT: v_mov_b32_e32 v15, v27
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v48, v0
+; GISEL-NEXT: v_mov_b32_e32 v49, v1
+; GISEL-NEXT: v_mov_b32_e32 v50, v2
+; GISEL-NEXT: v_mov_b32_e32 v51, v3
+; GISEL-NEXT: v_mov_b32_e32 v30, v4
+; GISEL-NEXT: v_mov_b32_e32 v31, v5
+; GISEL-NEXT: v_mov_b32_e32 v32, v6
+; GISEL-NEXT: v_mov_b32_e32 v33, v7
+; GISEL-NEXT: v_mov_b32_e32 v34, v8
+; GISEL-NEXT: v_mov_b32_e32 v35, v9
+; GISEL-NEXT: v_mov_b32_e32 v36, v10
+; GISEL-NEXT: v_mov_b32_e32 v37, v11
+; GISEL-NEXT: v_mov_b32_e32 v0, v12
+; GISEL-NEXT: v_mov_b32_e32 v1, v13
+; GISEL-NEXT: v_mov_b32_e32 v2, v14
+; GISEL-NEXT: v_mov_b32_e32 v3, v15
+; GISEL-NEXT: v_mov_b32_e32 v4, v16
+; GISEL-NEXT: v_mov_b32_e32 v5, v17
+; GISEL-NEXT: v_mov_b32_e32 v6, v18
+; GISEL-NEXT: v_mov_b32_e32 v7, v19
+; GISEL-NEXT: v_mov_b32_e32 v8, v20
+; GISEL-NEXT: v_mov_b32_e32 v9, v21
+; GISEL-NEXT: v_mov_b32_e32 v10, v22
+; GISEL-NEXT: v_mov_b32_e32 v11, v23
+; GISEL-NEXT: v_mov_b32_e32 v12, v24
+; GISEL-NEXT: v_mov_b32_e32 v13, v25
+; GISEL-NEXT: v_mov_b32_e32 v14, v26
+; GISEL-NEXT: v_mov_b32_e32 v15, v27
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[48:51], v[30:37], v28
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <16 x float> %result
}
define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
-; GCN-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_mov_b32_e32 v0, v12
-; GCN-NEXT: v_mov_b32_e32 v1, v13
-; GCN-NEXT: v_mov_b32_e32 v2, v14
-; GCN-NEXT: v_mov_b32_e32 v3, v15
-; GCN-NEXT: v_mov_b32_e32 v4, v16
-; GCN-NEXT: v_mov_b32_e32 v5, v17
-; GCN-NEXT: v_mov_b32_e32 v6, v18
-; GCN-NEXT: v_mov_b32_e32 v7, v19
-; GCN-NEXT: v_mov_b32_e32 v8, v20
-; GCN-NEXT: v_mov_b32_e32 v9, v21
-; GCN-NEXT: v_mov_b32_e32 v10, v22
-; GCN-NEXT: v_mov_b32_e32 v11, v23
-; GCN-NEXT: v_mov_b32_e32 v12, v24
-; GCN-NEXT: v_mov_b32_e32 v13, v25
-; GCN-NEXT: v_mov_b32_e32 v14, v26
-; GCN-NEXT: v_mov_b32_e32 v15, v27
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_mov_b32_e32 v0, v12
+; SDAG-NEXT: v_mov_b32_e32 v1, v13
+; SDAG-NEXT: v_mov_b32_e32 v2, v14
+; SDAG-NEXT: v_mov_b32_e32 v3, v15
+; SDAG-NEXT: v_mov_b32_e32 v4, v16
+; SDAG-NEXT: v_mov_b32_e32 v5, v17
+; SDAG-NEXT: v_mov_b32_e32 v6, v18
+; SDAG-NEXT: v_mov_b32_e32 v7, v19
+; SDAG-NEXT: v_mov_b32_e32 v8, v20
+; SDAG-NEXT: v_mov_b32_e32 v9, v21
+; SDAG-NEXT: v_mov_b32_e32 v10, v22
+; SDAG-NEXT: v_mov_b32_e32 v11, v23
+; SDAG-NEXT: v_mov_b32_e32 v12, v24
+; SDAG-NEXT: v_mov_b32_e32 v13, v25
+; SDAG-NEXT: v_mov_b32_e32 v14, v26
+; SDAG-NEXT: v_mov_b32_e32 v15, v27
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v48, v0
+; GISEL-NEXT: v_mov_b32_e32 v49, v1
+; GISEL-NEXT: v_mov_b32_e32 v50, v2
+; GISEL-NEXT: v_mov_b32_e32 v51, v3
+; GISEL-NEXT: v_mov_b32_e32 v30, v4
+; GISEL-NEXT: v_mov_b32_e32 v31, v5
+; GISEL-NEXT: v_mov_b32_e32 v32, v6
+; GISEL-NEXT: v_mov_b32_e32 v33, v7
+; GISEL-NEXT: v_mov_b32_e32 v34, v8
+; GISEL-NEXT: v_mov_b32_e32 v35, v9
+; GISEL-NEXT: v_mov_b32_e32 v36, v10
+; GISEL-NEXT: v_mov_b32_e32 v37, v11
+; GISEL-NEXT: v_mov_b32_e32 v0, v12
+; GISEL-NEXT: v_mov_b32_e32 v1, v13
+; GISEL-NEXT: v_mov_b32_e32 v2, v14
+; GISEL-NEXT: v_mov_b32_e32 v3, v15
+; GISEL-NEXT: v_mov_b32_e32 v4, v16
+; GISEL-NEXT: v_mov_b32_e32 v5, v17
+; GISEL-NEXT: v_mov_b32_e32 v6, v18
+; GISEL-NEXT: v_mov_b32_e32 v7, v19
+; GISEL-NEXT: v_mov_b32_e32 v8, v20
+; GISEL-NEXT: v_mov_b32_e32 v9, v21
+; GISEL-NEXT: v_mov_b32_e32 v10, v22
+; GISEL-NEXT: v_mov_b32_e32 v11, v23
+; GISEL-NEXT: v_mov_b32_e32 v12, v24
+; GISEL-NEXT: v_mov_b32_e32 v13, v25
+; GISEL-NEXT: v_mov_b32_e32 v14, v26
+; GISEL-NEXT: v_mov_b32_e32 v15, v27
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
ret <16 x float> %result
}
define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
-; GCN-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags1:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_mov_b32_e32 v0, v12
-; GCN-NEXT: v_mov_b32_e32 v1, v13
-; GCN-NEXT: v_mov_b32_e32 v2, v14
-; GCN-NEXT: v_mov_b32_e32 v3, v15
-; GCN-NEXT: v_mov_b32_e32 v4, v16
-; GCN-NEXT: v_mov_b32_e32 v5, v17
-; GCN-NEXT: v_mov_b32_e32 v6, v18
-; GCN-NEXT: v_mov_b32_e32 v7, v19
-; GCN-NEXT: v_mov_b32_e32 v8, v20
-; GCN-NEXT: v_mov_b32_e32 v9, v21
-; GCN-NEXT: v_mov_b32_e32 v10, v22
-; GCN-NEXT: v_mov_b32_e32 v11, v23
-; GCN-NEXT: v_mov_b32_e32 v12, v24
-; GCN-NEXT: v_mov_b32_e32 v13, v25
-; GCN-NEXT: v_mov_b32_e32 v14, v26
-; GCN-NEXT: v_mov_b32_e32 v15, v27
-; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
- ret <16 x float> %result
-}
-
-define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x float> inreg %arg2, i32 inreg %arg3) {
-; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__sgpr:
+; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags1:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v26, s0
-; SDAG-NEXT: v_mov_b32_e32 v27, s1
-; SDAG-NEXT: v_mov_b32_e32 v28, s2
-; SDAG-NEXT: v_mov_b32_e32 v29, s3
-; SDAG-NEXT: v_mov_b32_e32 v16, v10
-; SDAG-NEXT: v_mov_b32_e32 v15, v9
-; SDAG-NEXT: v_mov_b32_e32 v14, v8
-; SDAG-NEXT: v_mov_b32_e32 v13, v7
-; SDAG-NEXT: v_mov_b32_e32 v12, v6
-; SDAG-NEXT: v_mov_b32_e32 v11, v5
-; SDAG-NEXT: v_mov_b32_e32 v10, v4
-; SDAG-NEXT: v_mov_b32_e32 v9, v3
-; SDAG-NEXT: v_mov_b32_e32 v8, v2
-; SDAG-NEXT: v_mov_b32_e32 v7, v1
-; SDAG-NEXT: v_mov_b32_e32 v6, v0
-; SDAG-NEXT: v_mov_b32_e32 v0, s24
-; SDAG-NEXT: v_mov_b32_e32 v1, s25
-; SDAG-NEXT: v_mov_b32_e32 v2, s26
-; SDAG-NEXT: v_mov_b32_e32 v3, s27
-; SDAG-NEXT: v_mov_b32_e32 v4, s28
-; SDAG-NEXT: v_mov_b32_e32 v5, s29
-; SDAG-NEXT: v_mov_b32_e32 v18, s16
-; SDAG-NEXT: v_mov_b32_e32 v19, s17
-; SDAG-NEXT: v_mov_b32_e32 v20, s18
-; SDAG-NEXT: v_mov_b32_e32 v21, s19
-; SDAG-NEXT: v_mov_b32_e32 v22, s20
-; SDAG-NEXT: v_mov_b32_e32 v23, s21
-; SDAG-NEXT: v_mov_b32_e32 v24, s22
-; SDAG-NEXT: v_mov_b32_e32 v25, s23
-; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[26:29], v[18:25], v16
+; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_mov_b32_e32 v0, v12
+; SDAG-NEXT: v_mov_b32_e32 v1, v13
+; SDAG-NEXT: v_mov_b32_e32 v2, v14
+; SDAG-NEXT: v_mov_b32_e32 v3, v15
+; SDAG-NEXT: v_mov_b32_e32 v4, v16
+; SDAG-NEXT: v_mov_b32_e32 v5, v17
+; SDAG-NEXT: v_mov_b32_e32 v6, v18
+; SDAG-NEXT: v_mov_b32_e32 v7, v19
+; SDAG-NEXT: v_mov_b32_e32 v8, v20
+; SDAG-NEXT: v_mov_b32_e32 v9, v21
+; SDAG-NEXT: v_mov_b32_e32 v10, v22
+; SDAG-NEXT: v_mov_b32_e32 v11, v23
+; SDAG-NEXT: v_mov_b32_e32 v12, v24
+; SDAG-NEXT: v_mov_b32_e32 v13, v25
+; SDAG-NEXT: v_mov_b32_e32 v14, v26
+; SDAG-NEXT: v_mov_b32_e32 v15, v27
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__sgpr:
+; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags1:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[2:3]
-; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[0:1]
-; GISEL-NEXT: v_mov_b32_e32 v18, v0
-; GISEL-NEXT: v_mov_b32_e32 v19, v1
-; GISEL-NEXT: v_mov_b32_e32 v20, v2
-; GISEL-NEXT: v_mov_b32_e32 v21, v3
-; GISEL-NEXT: v_mov_b32_e32 v22, v4
-; GISEL-NEXT: v_mov_b32_e32 v23, v5
-; GISEL-NEXT: v_mov_b32_e32 v24, v6
-; GISEL-NEXT: v_mov_b32_e32 v25, v7
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[16:17]
-; GISEL-NEXT: v_mov_b32_e32 v26, v8
-; GISEL-NEXT: v_mov_b32_e32 v27, v9
-; GISEL-NEXT: v_mov_b32_e32 v12, s24
-; GISEL-NEXT: v_mov_b32_e32 v13, s25
-; GISEL-NEXT: v_mov_b32_e32 v14, s26
-; GISEL-NEXT: v_mov_b32_e32 v15, s27
-; GISEL-NEXT: v_mov_b32_e32 v16, s28
-; GISEL-NEXT: v_mov_b32_e32 v17, s29
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[22:23]
-; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[28:31], v[0:7], v10
-; GISEL-NEXT: s_nop 11
+; GISEL-NEXT: v_mov_b32_e32 v48, v0
+; GISEL-NEXT: v_mov_b32_e32 v49, v1
+; GISEL-NEXT: v_mov_b32_e32 v50, v2
+; GISEL-NEXT: v_mov_b32_e32 v51, v3
+; GISEL-NEXT: v_mov_b32_e32 v30, v4
+; GISEL-NEXT: v_mov_b32_e32 v31, v5
+; GISEL-NEXT: v_mov_b32_e32 v32, v6
+; GISEL-NEXT: v_mov_b32_e32 v33, v7
+; GISEL-NEXT: v_mov_b32_e32 v34, v8
+; GISEL-NEXT: v_mov_b32_e32 v35, v9
+; GISEL-NEXT: v_mov_b32_e32 v36, v10
+; GISEL-NEXT: v_mov_b32_e32 v37, v11
; GISEL-NEXT: v_mov_b32_e32 v0, v12
; GISEL-NEXT: v_mov_b32_e32 v1, v13
; GISEL-NEXT: v_mov_b32_e32 v2, v14
@@ -2800,6 +3408,104 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__sgpr(<4 x i32> inreg %arg
; GISEL-NEXT: v_mov_b32_e32 v13, v25
; GISEL-NEXT: v_mov_b32_e32 v14, v26
; GISEL-NEXT: v_mov_b32_e32 v15, v27
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
+ ret <16 x float> %result
+}
+
+define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x float> inreg %arg2, i32 inreg %arg3) {
+; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__sgpr:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_mov_b32_e32 v36, s0
+; SDAG-NEXT: v_mov_b32_e32 v37, s1
+; SDAG-NEXT: v_mov_b32_e32 v38, s2
+; SDAG-NEXT: v_mov_b32_e32 v39, s3
+; SDAG-NEXT: v_mov_b32_e32 v13, s25
+; SDAG-NEXT: v_mov_b32_e32 v14, s26
+; SDAG-NEXT: v_mov_b32_e32 v15, s27
+; SDAG-NEXT: v_mov_b32_e32 v16, s28
+; SDAG-NEXT: v_mov_b32_e32 v17, s29
+; SDAG-NEXT: v_mov_b32_e32 v28, s16
+; SDAG-NEXT: v_mov_b32_e32 v29, s17
+; SDAG-NEXT: v_mov_b32_e32 v30, s18
+; SDAG-NEXT: v_mov_b32_e32 v31, s19
+; SDAG-NEXT: v_mov_b32_e32 v32, s20
+; SDAG-NEXT: v_mov_b32_e32 v33, s21
+; SDAG-NEXT: v_mov_b32_e32 v34, s22
+; SDAG-NEXT: v_mov_b32_e32 v35, s23
+; SDAG-NEXT: v_mov_b32_e32 v12, s24
+; SDAG-NEXT: v_mov_b32_e32 v18, v0
+; SDAG-NEXT: v_mov_b32_e32 v19, v1
+; SDAG-NEXT: v_mov_b32_e32 v20, v2
+; SDAG-NEXT: v_mov_b32_e32 v21, v3
+; SDAG-NEXT: v_mov_b32_e32 v22, v4
+; SDAG-NEXT: v_mov_b32_e32 v23, v5
+; SDAG-NEXT: v_mov_b32_e32 v24, v6
+; SDAG-NEXT: v_mov_b32_e32 v25, v7
+; SDAG-NEXT: v_mov_b32_e32 v26, v8
+; SDAG-NEXT: v_mov_b32_e32 v27, v9
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[36:39], v[28:35], v10
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: v_mov_b32_e32 v0, v12
+; SDAG-NEXT: v_mov_b32_e32 v1, v13
+; SDAG-NEXT: v_mov_b32_e32 v2, v14
+; SDAG-NEXT: v_mov_b32_e32 v3, v15
+; SDAG-NEXT: v_mov_b32_e32 v4, v16
+; SDAG-NEXT: v_mov_b32_e32 v5, v17
+; SDAG-NEXT: v_mov_b32_e32 v6, v18
+; SDAG-NEXT: v_mov_b32_e32 v7, v19
+; SDAG-NEXT: v_mov_b32_e32 v8, v20
+; SDAG-NEXT: v_mov_b32_e32 v9, v21
+; SDAG-NEXT: v_mov_b32_e32 v10, v22
+; SDAG-NEXT: v_mov_b32_e32 v11, v23
+; SDAG-NEXT: v_mov_b32_e32 v12, v24
+; SDAG-NEXT: v_mov_b32_e32 v13, v25
+; SDAG-NEXT: v_mov_b32_e32 v14, v26
+; SDAG-NEXT: v_mov_b32_e32 v15, v27
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__sgpr:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[2:3]
+; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[0:1]
+; GISEL-NEXT: v_mov_b32_e32 v11, v0
+; GISEL-NEXT: v_mov_b32_e32 v12, v1
+; GISEL-NEXT: v_mov_b32_e32 v13, v2
+; GISEL-NEXT: v_mov_b32_e32 v14, v3
+; GISEL-NEXT: v_mov_b32_e32 v15, v4
+; GISEL-NEXT: v_mov_b32_e32 v16, v5
+; GISEL-NEXT: v_mov_b32_e32 v17, v6
+; GISEL-NEXT: v_mov_b32_e32 v18, v7
+; GISEL-NEXT: v_mov_b32_e32 v19, v8
+; GISEL-NEXT: v_mov_b32_e32 v20, v9
+; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[22:23]
+; GISEL-NEXT: v_mov_b32_e32 v21, v10
+; GISEL-NEXT: v_mov_b32_e32 v0, s24
+; GISEL-NEXT: v_mov_b32_e32 v1, s25
+; GISEL-NEXT: v_mov_b32_e32 v2, s26
+; GISEL-NEXT: v_mov_b32_e32 v3, s27
+; GISEL-NEXT: v_mov_b32_e32 v4, s28
+; GISEL-NEXT: v_mov_b32_e32 v5, s29
+; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17]
+; GISEL-NEXT: v_mov_b32_e32 v6, v11
+; GISEL-NEXT: v_mov_b32_e32 v7, v12
+; GISEL-NEXT: v_mov_b32_e32 v8, v13
+; GISEL-NEXT: v_mov_b32_e32 v9, v14
+; GISEL-NEXT: v_mov_b32_e32 v10, v15
+; GISEL-NEXT: v_mov_b32_e32 v11, v16
+; GISEL-NEXT: v_mov_b32_e32 v12, v17
+; GISEL-NEXT: v_mov_b32_e32 v13, v18
+; GISEL-NEXT: v_mov_b32_e32 v14, v19
+; GISEL-NEXT: v_mov_b32_e32 v15, v20
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[30:33], v[22:29], v21
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <16 x float> %result
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll
index d3e171be10802..4366472c73a0e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll
@@ -246,6 +246,7 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__vgpr_rsrc__vgpr_v
; GFX90A-NEXT: buffer_atomic_add_f32 v0, v[8:9], s[8:11], s12 idxen offen
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
; GFX90A-NEXT: ; implicit-def: $vgpr7
+; GFX90A-NEXT: ; implicit-def: $vgpr0
; GFX90A-NEXT: ; implicit-def: $vgpr8_vgpr9
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB4_1
@@ -279,6 +280,7 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__vgpr_rsrc__vgpr_v
; GFX942-NEXT: buffer_atomic_add_f32 v0, v[8:9], s[4:7], s8 idxen offen
; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
; GFX942-NEXT: ; implicit-def: $vgpr7
+; GFX942-NEXT: ; implicit-def: $vgpr0
; GFX942-NEXT: ; implicit-def: $vgpr8_vgpr9
; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB4_1
@@ -418,6 +420,7 @@ define void @struct_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__vgpr_rsrc__vgpr
; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v[8:9], s[8:11], s12 idxen offen
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
; GFX90A-NEXT: ; implicit-def: $vgpr7
+; GFX90A-NEXT: ; implicit-def: $vgpr0
; GFX90A-NEXT: ; implicit-def: $vgpr8_vgpr9
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB5_1
@@ -451,6 +454,7 @@ define void @struct_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__vgpr_rsrc__vgpr
; GFX942-NEXT: buffer_atomic_pk_add_f16 v0, v[8:9], s[4:7], s8 idxen offen
; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
; GFX942-NEXT: ; implicit-def: $vgpr7
+; GFX942-NEXT: ; implicit-def: $vgpr0
; GFX942-NEXT: ; implicit-def: $vgpr8_vgpr9
; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB5_1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll
index 5b72e006072df..0191a85b33888 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll
@@ -193,8 +193,7 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__vgpr_rsrc__vgpr_vo
; GFX90A-NEXT: s_and_b64 s[4:5], s[4:5], vcc
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v1, v0
-; GFX90A-NEXT: buffer_atomic_add_f32 v1, v[8:9], s[8:11], s12 idxen offen glc
+; GFX90A-NEXT: buffer_atomic_add_f32 v0, v[8:9], s[8:11], s12 idxen offen glc
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
; GFX90A-NEXT: ; implicit-def: $vgpr7
; GFX90A-NEXT: ; implicit-def: $vgpr8_vgpr9
@@ -203,7 +202,6 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__vgpr_rsrc__vgpr_vo
; GFX90A-NEXT: ; %bb.2:
; GFX90A-NEXT: s_mov_b64 exec, s[6:7]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, v1
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__vgpr_rsrc__vgpr_voffset__vgpr_soffset:
@@ -229,8 +227,7 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__vgpr_rsrc__vgpr_vo
; GFX942-NEXT: s_and_b64 s[0:1], s[0:1], vcc
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v1, v0
-; GFX942-NEXT: buffer_atomic_add_f32 v1, v[8:9], s[4:7], s8 idxen offen sc0
+; GFX942-NEXT: buffer_atomic_add_f32 v0, v[8:9], s[4:7], s8 idxen offen sc0
; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
; GFX942-NEXT: ; implicit-def: $vgpr7
; GFX942-NEXT: ; implicit-def: $vgpr8_vgpr9
@@ -239,7 +236,6 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__vgpr_rsrc__vgpr_vo
; GFX942-NEXT: ; %bb.2:
; GFX942-NEXT: s_mov_b64 exec, s[2:3]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v0, v1
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX1200-LABEL: struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__vgpr_rsrc__vgpr_voffset__vgpr_soffset:
@@ -343,8 +339,7 @@ define <2 x half> @struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__vgpr_rsrc__
; GFX90A-NEXT: s_and_b64 s[4:5], s[4:5], vcc
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v1, v0
-; GFX90A-NEXT: buffer_atomic_pk_add_f16 v1, v[8:9], s[8:11], s12 idxen offen glc
+; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v[8:9], s[8:11], s12 idxen offen glc
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
; GFX90A-NEXT: ; implicit-def: $vgpr7
; GFX90A-NEXT: ; implicit-def: $vgpr8_vgpr9
@@ -353,7 +348,6 @@ define <2 x half> @struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__vgpr_rsrc__
; GFX90A-NEXT: ; %bb.2:
; GFX90A-NEXT: s_mov_b64 exec, s[6:7]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, v1
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__vgpr_rsrc__vgpr_voffset__vgpr_soffset:
@@ -379,8 +373,7 @@ define <2 x half> @struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__vgpr_rsrc__
; GFX942-NEXT: s_and_b64 s[0:1], s[0:1], vcc
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v1, v0
-; GFX942-NEXT: buffer_atomic_pk_add_f16 v1, v[8:9], s[4:7], s8 idxen offen sc0
+; GFX942-NEXT: buffer_atomic_pk_add_f16 v0, v[8:9], s[4:7], s8 idxen offen sc0
; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
; GFX942-NEXT: ; implicit-def: $vgpr7
; GFX942-NEXT: ; implicit-def: $vgpr8_vgpr9
@@ -389,7 +382,6 @@ define <2 x half> @struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__vgpr_rsrc__
; GFX942-NEXT: ; %bb.2:
; GFX942-NEXT: s_mov_b64 exec, s[2:3]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v0, v1
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX1200-LABEL: struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__vgpr_rsrc__vgpr_voffset__vgpr_soffset:
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll
index 1c04ff3e83326..9dac2393fd966 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll
@@ -85,7 +85,7 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p
; GFX942-SDAG-NEXT: s_mov_b32 s3, s12
; GFX942-SDAG-NEXT: s_or_b64 s[8:9], s[2:3], s[12:13]
; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX942-SDAG-NEXT: buffer_load_dword v1, v0, s[8:11], 0 offen nt
+; GFX942-SDAG-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen nt
; GFX942-SDAG-NEXT: s_load_dword s13, s[4:5], 0x30
; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20
; GFX942-SDAG-NEXT: s_mov_b32 s5, s12
@@ -96,9 +96,9 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p
; GFX942-SDAG-NEXT: s_mov_b32 s2, s1
; GFX942-SDAG-NEXT: s_mov_b32 s3, s12
; GFX942-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s0
; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX942-SDAG-NEXT: buffer_store_dword v1, v0, s[4:7], 0 offen nt
+; GFX942-SDAG-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen nt
; GFX942-SDAG-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: buffer_nontemporal_load_store:
@@ -115,7 +115,7 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p
; GFX942-GISEL-NEXT: s_mov_b32 s6, s3
; GFX942-GISEL-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11]
; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX942-GISEL-NEXT: buffer_load_dword v1, v0, s[8:11], 0 offen nt
+; GFX942-GISEL-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen nt
; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20
; GFX942-GISEL-NEXT: s_load_dword s9, s[4:5], 0x30
; GFX942-GISEL-NEXT: s_mov_b32 s4, s7
@@ -126,9 +126,9 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p
; GFX942-GISEL-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
; GFX942-GISEL-NEXT: s_mov_b32 s6, s3
; GFX942-GISEL-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, s0
; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX942-GISEL-NEXT: buffer_store_dword v1, v0, s[4:7], 0 offen nt
+; GFX942-GISEL-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen nt
; GFX942-GISEL-NEXT: s_endpgm
;
; GFX10-SDAG-LABEL: buffer_nontemporal_load_store:
@@ -413,7 +413,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX942-SDAG-NEXT: s_mov_b32 s3, s12
; GFX942-SDAG-NEXT: s_or_b64 s[8:9], s[2:3], s[12:13]
; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX942-SDAG-NEXT: buffer_load_dword v1, v0, s[8:11], 0 offen sc0 sc1
+; GFX942-SDAG-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen sc0 sc1
; GFX942-SDAG-NEXT: s_load_dword s13, s[4:5], 0x30
; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20
; GFX942-SDAG-NEXT: s_mov_b32 s5, s12
@@ -424,9 +424,9 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX942-SDAG-NEXT: s_mov_b32 s2, s1
; GFX942-SDAG-NEXT: s_mov_b32 s3, s12
; GFX942-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s0
; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX942-SDAG-NEXT: buffer_store_dword v1, v0, s[4:7], 0 offen sc0 sc1
+; GFX942-SDAG-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen sc0 sc1
; GFX942-SDAG-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: buffer_nontemporal_and_volatile_load_store:
@@ -443,7 +443,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX942-GISEL-NEXT: s_mov_b32 s6, s3
; GFX942-GISEL-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11]
; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX942-GISEL-NEXT: buffer_load_dword v1, v0, s[8:11], 0 offen sc0 sc1
+; GFX942-GISEL-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen sc0 sc1
; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20
; GFX942-GISEL-NEXT: s_load_dword s9, s[4:5], 0x30
; GFX942-GISEL-NEXT: s_mov_b32 s4, s7
@@ -454,9 +454,9 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX942-GISEL-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
; GFX942-GISEL-NEXT: s_mov_b32 s6, s3
; GFX942-GISEL-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, s0
; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX942-GISEL-NEXT: buffer_store_dword v1, v0, s[4:7], 0 offen sc0 sc1
+; GFX942-GISEL-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen sc0 sc1
; GFX942-GISEL-NEXT: s_endpgm
;
; GFX10-SDAG-LABEL: buffer_nontemporal_and_volatile_load_store:
diff --git a/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll b/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll
index 3c4a29c54928d..9585c486aeb9e 100644
--- a/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck --check-prefixes=GFX908 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --check-prefixes=GFX90ADAG,GFX90A %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --check-prefixes=GFX90AGSEL,GFX90A %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GFX942DAG,GFX942 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GFX942GSEL,GFX942 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck --check-prefixes=GCN,GFX908 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --check-prefixes=GCN,GFX90A %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --check-prefixes=GCN,GFX90A %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GCN,GFX90A %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GCN,GFX90A %s
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)
@@ -86,254 +86,62 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vgpr(ptr addrspace(1) %arg)
; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31]
; GFX908-NEXT: s_nop 15
; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a28
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a29
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a30
; GFX908-NEXT: v_accvgpr_read_b32 v3, a31
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a28
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:112
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a16
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a17
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a18
; GFX908-NEXT: v_accvgpr_read_b32 v3, a19
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a16
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a20
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a21
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a22
; GFX908-NEXT: v_accvgpr_read_b32 v3, a23
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a20
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a8
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a9
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a10
; GFX908-NEXT: v_accvgpr_read_b32 v3, a11
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a8
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a12
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a13
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
; GFX908-NEXT: v_accvgpr_read_b32 v3, a15
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a12
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35]
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16
; GFX908-NEXT: s_endpgm
-;
-; GFX90ADAG-LABEL: test_mfma_f32_32x32x1f32_vgpr:
-; GFX90ADAG: ; %bb.0: ; %bb
-; GFX90ADAG-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
-; GFX90ADAG-NEXT: v_mov_b32_e32 v33, 1.0
-; GFX90ADAG-NEXT: v_mov_b32_e32 v34, 2.0
-; GFX90ADAG-NEXT: v_mov_b32_e32 v32, 0
-; GFX90ADAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90ADAG-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0
-; GFX90ADAG-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40
-; GFX90ADAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90ADAG-NEXT: v_mov_b32_e32 v0, s16
-; GFX90ADAG-NEXT: v_mov_b32_e32 v1, s17
-; GFX90ADAG-NEXT: v_mov_b32_e32 v2, s18
-; GFX90ADAG-NEXT: v_mov_b32_e32 v3, s19
-; GFX90ADAG-NEXT: v_mov_b32_e32 v4, s20
-; GFX90ADAG-NEXT: v_mov_b32_e32 v5, s21
-; GFX90ADAG-NEXT: v_mov_b32_e32 v6, s22
-; GFX90ADAG-NEXT: v_mov_b32_e32 v7, s23
-; GFX90ADAG-NEXT: v_mov_b32_e32 v8, s24
-; GFX90ADAG-NEXT: v_mov_b32_e32 v9, s25
-; GFX90ADAG-NEXT: v_mov_b32_e32 v10, s26
-; GFX90ADAG-NEXT: v_mov_b32_e32 v11, s27
-; GFX90ADAG-NEXT: v_mov_b32_e32 v12, s28
-; GFX90ADAG-NEXT: v_mov_b32_e32 v13, s29
-; GFX90ADAG-NEXT: v_mov_b32_e32 v14, s30
-; GFX90ADAG-NEXT: v_mov_b32_e32 v15, s31
-; GFX90ADAG-NEXT: v_mov_b32_e32 v16, s0
-; GFX90ADAG-NEXT: v_mov_b32_e32 v17, s1
-; GFX90ADAG-NEXT: v_mov_b32_e32 v18, s2
-; GFX90ADAG-NEXT: v_mov_b32_e32 v19, s3
-; GFX90ADAG-NEXT: v_mov_b32_e32 v20, s4
-; GFX90ADAG-NEXT: v_mov_b32_e32 v21, s5
-; GFX90ADAG-NEXT: v_mov_b32_e32 v22, s6
-; GFX90ADAG-NEXT: v_mov_b32_e32 v23, s7
-; GFX90ADAG-NEXT: v_mov_b32_e32 v24, s8
-; GFX90ADAG-NEXT: v_mov_b32_e32 v25, s9
-; GFX90ADAG-NEXT: v_mov_b32_e32 v26, s10
-; GFX90ADAG-NEXT: v_mov_b32_e32 v27, s11
-; GFX90ADAG-NEXT: v_mov_b32_e32 v28, s12
-; GFX90ADAG-NEXT: v_mov_b32_e32 v29, s13
-; GFX90ADAG-NEXT: v_mov_b32_e32 v30, s14
-; GFX90ADAG-NEXT: v_mov_b32_e32 v31, s15
-; GFX90ADAG-NEXT: s_nop 1
-; GFX90ADAG-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v33, v34, v[0:31]
-; GFX90ADAG-NEXT: s_nop 15
-; GFX90ADAG-NEXT: s_nop 2
-; GFX90ADAG-NEXT: global_store_dwordx4 v32, v[24:27], s[34:35] offset:96
-; GFX90ADAG-NEXT: global_store_dwordx4 v32, v[28:31], s[34:35] offset:112
-; GFX90ADAG-NEXT: global_store_dwordx4 v32, v[16:19], s[34:35] offset:64
-; GFX90ADAG-NEXT: global_store_dwordx4 v32, v[20:23], s[34:35] offset:80
-; GFX90ADAG-NEXT: global_store_dwordx4 v32, v[8:11], s[34:35] offset:32
-; GFX90ADAG-NEXT: global_store_dwordx4 v32, v[12:15], s[34:35] offset:48
-; GFX90ADAG-NEXT: global_store_dwordx4 v32, v[0:3], s[34:35]
-; GFX90ADAG-NEXT: global_store_dwordx4 v32, v[4:7], s[34:35] offset:16
-; GFX90ADAG-NEXT: s_endpgm
-;
-; GFX90AGSEL-LABEL: test_mfma_f32_32x32x1f32_vgpr:
-; GFX90AGSEL: ; %bb.0: ; %bb
-; GFX90AGSEL-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
-; GFX90AGSEL-NEXT: v_mov_b32_e32 v32, 1.0
-; GFX90AGSEL-NEXT: v_mov_b32_e32 v33, 2.0
-; GFX90AGSEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90AGSEL-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x0
-; GFX90AGSEL-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x40
-; GFX90AGSEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90AGSEL-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90AGSEL-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90AGSEL-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1]
-; GFX90AGSEL-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90AGSEL-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1]
-; GFX90AGSEL-NEXT: v_pk_mov_b32 v[10:11], s[10:11], s[10:11] op_sel:[0,1]
-; GFX90AGSEL-NEXT: v_pk_mov_b32 v[12:13], s[12:13], s[12:13] op_sel:[0,1]
-; GFX90AGSEL-NEXT: v_pk_mov_b32 v[14:15], s[14:15], s[14:15] op_sel:[0,1]
-; GFX90AGSEL-NEXT: v_pk_mov_b32 v[16:17], s[16:17], s[16:17] op_sel:[0,1]
-; GFX90AGSEL-NEXT: v_pk_mov_b32 v[18:19], s[18:19], s[18:19] op_sel:[0,1]
-; GFX90AGSEL-NEXT: v_pk_mov_b32 v[20:21], s[20:21], s[20:21] op_sel:[0,1]
-; GFX90AGSEL-NEXT: v_pk_mov_b32 v[22:23], s[22:23], s[22:23] op_sel:[0,1]
-; GFX90AGSEL-NEXT: v_pk_mov_b32 v[24:25], s[24:25], s[24:25] op_sel:[0,1]
-; GFX90AGSEL-NEXT: v_pk_mov_b32 v[26:27], s[26:27], s[26:27] op_sel:[0,1]
-; GFX90AGSEL-NEXT: v_pk_mov_b32 v[28:29], s[28:29], s[28:29] op_sel:[0,1]
-; GFX90AGSEL-NEXT: v_pk_mov_b32 v[30:31], s[30:31], s[30:31] op_sel:[0,1]
-; GFX90AGSEL-NEXT: s_nop 1
-; GFX90AGSEL-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v32, v33, v[0:31]
-; GFX90AGSEL-NEXT: v_mov_b32_e32 v32, 0
-; GFX90AGSEL-NEXT: s_nop 15
-; GFX90AGSEL-NEXT: s_nop 1
-; GFX90AGSEL-NEXT: global_store_dwordx4 v32, v[0:3], s[34:35]
-; GFX90AGSEL-NEXT: global_store_dwordx4 v32, v[4:7], s[34:35] offset:16
-; GFX90AGSEL-NEXT: global_store_dwordx4 v32, v[8:11], s[34:35] offset:32
-; GFX90AGSEL-NEXT: global_store_dwordx4 v32, v[12:15], s[34:35] offset:48
-; GFX90AGSEL-NEXT: global_store_dwordx4 v32, v[16:19], s[34:35] offset:64
-; GFX90AGSEL-NEXT: global_store_dwordx4 v32, v[20:23], s[34:35] offset:80
-; GFX90AGSEL-NEXT: global_store_dwordx4 v32, v[24:27], s[34:35] offset:96
-; GFX90AGSEL-NEXT: global_store_dwordx4 v32, v[28:31], s[34:35] offset:112
-; GFX90AGSEL-NEXT: s_endpgm
-;
-; GFX942DAG-LABEL: test_mfma_f32_32x32x1f32_vgpr:
-; GFX942DAG: ; %bb.0: ; %bb
-; GFX942DAG-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
-; GFX942DAG-NEXT: v_mov_b32_e32 v33, 1.0
-; GFX942DAG-NEXT: v_mov_b32_e32 v34, 2.0
-; GFX942DAG-NEXT: v_mov_b32_e32 v32, 0
-; GFX942DAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942DAG-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0
-; GFX942DAG-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40
-; GFX942DAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942DAG-NEXT: v_mov_b32_e32 v0, s16
-; GFX942DAG-NEXT: v_mov_b32_e32 v1, s17
-; GFX942DAG-NEXT: v_mov_b32_e32 v2, s18
-; GFX942DAG-NEXT: v_mov_b32_e32 v3, s19
-; GFX942DAG-NEXT: v_mov_b32_e32 v4, s20
-; GFX942DAG-NEXT: v_mov_b32_e32 v5, s21
-; GFX942DAG-NEXT: v_mov_b32_e32 v6, s22
-; GFX942DAG-NEXT: v_mov_b32_e32 v7, s23
-; GFX942DAG-NEXT: v_mov_b32_e32 v8, s24
-; GFX942DAG-NEXT: v_mov_b32_e32 v9, s25
-; GFX942DAG-NEXT: v_mov_b32_e32 v10, s26
-; GFX942DAG-NEXT: v_mov_b32_e32 v11, s27
-; GFX942DAG-NEXT: v_mov_b32_e32 v12, s28
-; GFX942DAG-NEXT: v_mov_b32_e32 v13, s29
-; GFX942DAG-NEXT: v_mov_b32_e32 v14, s30
-; GFX942DAG-NEXT: v_mov_b32_e32 v15, s31
-; GFX942DAG-NEXT: v_mov_b32_e32 v16, s0
-; GFX942DAG-NEXT: v_mov_b32_e32 v17, s1
-; GFX942DAG-NEXT: v_mov_b32_e32 v18, s2
-; GFX942DAG-NEXT: v_mov_b32_e32 v19, s3
-; GFX942DAG-NEXT: v_mov_b32_e32 v20, s4
-; GFX942DAG-NEXT: v_mov_b32_e32 v21, s5
-; GFX942DAG-NEXT: v_mov_b32_e32 v22, s6
-; GFX942DAG-NEXT: v_mov_b32_e32 v23, s7
-; GFX942DAG-NEXT: v_mov_b32_e32 v24, s8
-; GFX942DAG-NEXT: v_mov_b32_e32 v25, s9
-; GFX942DAG-NEXT: v_mov_b32_e32 v26, s10
-; GFX942DAG-NEXT: v_mov_b32_e32 v27, s11
-; GFX942DAG-NEXT: v_mov_b32_e32 v28, s12
-; GFX942DAG-NEXT: v_mov_b32_e32 v29, s13
-; GFX942DAG-NEXT: v_mov_b32_e32 v30, s14
-; GFX942DAG-NEXT: v_mov_b32_e32 v31, s15
-; GFX942DAG-NEXT: s_nop 1
-; GFX942DAG-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v33, v34, v[0:31]
-; GFX942DAG-NEXT: s_nop 15
-; GFX942DAG-NEXT: s_nop 1
-; GFX942DAG-NEXT: global_store_dwordx4 v32, v[24:27], s[34:35] offset:96
-; GFX942DAG-NEXT: global_store_dwordx4 v32, v[28:31], s[34:35] offset:112
-; GFX942DAG-NEXT: global_store_dwordx4 v32, v[16:19], s[34:35] offset:64
-; GFX942DAG-NEXT: global_store_dwordx4 v32, v[20:23], s[34:35] offset:80
-; GFX942DAG-NEXT: global_store_dwordx4 v32, v[8:11], s[34:35] offset:32
-; GFX942DAG-NEXT: global_store_dwordx4 v32, v[12:15], s[34:35] offset:48
-; GFX942DAG-NEXT: global_store_dwordx4 v32, v[0:3], s[34:35]
-; GFX942DAG-NEXT: global_store_dwordx4 v32, v[4:7], s[34:35] offset:16
-; GFX942DAG-NEXT: s_endpgm
-;
-; GFX942GSEL-LABEL: test_mfma_f32_32x32x1f32_vgpr:
-; GFX942GSEL: ; %bb.0: ; %bb
-; GFX942GSEL-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
-; GFX942GSEL-NEXT: v_mov_b32_e32 v32, 1.0
-; GFX942GSEL-NEXT: v_mov_b32_e32 v33, 2.0
-; GFX942GSEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942GSEL-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x0
-; GFX942GSEL-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x40
-; GFX942GSEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942GSEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX942GSEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX942GSEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
-; GFX942GSEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
-; GFX942GSEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
-; GFX942GSEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
-; GFX942GSEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
-; GFX942GSEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
-; GFX942GSEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17]
-; GFX942GSEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19]
-; GFX942GSEL-NEXT: v_mov_b64_e32 v[20:21], s[20:21]
-; GFX942GSEL-NEXT: v_mov_b64_e32 v[22:23], s[22:23]
-; GFX942GSEL-NEXT: v_mov_b64_e32 v[24:25], s[24:25]
-; GFX942GSEL-NEXT: v_mov_b64_e32 v[26:27], s[26:27]
-; GFX942GSEL-NEXT: v_mov_b64_e32 v[28:29], s[28:29]
-; GFX942GSEL-NEXT: v_mov_b64_e32 v[30:31], s[30:31]
-; GFX942GSEL-NEXT: s_nop 1
-; GFX942GSEL-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
-; GFX942GSEL-NEXT: v_mov_b32_e32 v32, 0
-; GFX942GSEL-NEXT: s_nop 15
-; GFX942GSEL-NEXT: s_nop 0
-; GFX942GSEL-NEXT: global_store_dwordx4 v32, v[0:3], s[34:35]
-; GFX942GSEL-NEXT: global_store_dwordx4 v32, v[4:7], s[34:35] offset:16
-; GFX942GSEL-NEXT: global_store_dwordx4 v32, v[8:11], s[34:35] offset:32
-; GFX942GSEL-NEXT: global_store_dwordx4 v32, v[12:15], s[34:35] offset:48
-; GFX942GSEL-NEXT: global_store_dwordx4 v32, v[16:19], s[34:35] offset:64
-; GFX942GSEL-NEXT: global_store_dwordx4 v32, v[20:23], s[34:35] offset:80
-; GFX942GSEL-NEXT: global_store_dwordx4 v32, v[24:27], s[34:35] offset:96
-; GFX942GSEL-NEXT: global_store_dwordx4 v32, v[28:31], s[34:35] offset:112
-; GFX942GSEL-NEXT: s_endpgm
bb:
%in.1 = load <32 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0)
@@ -420,286 +228,62 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_agpr(ptr addrspace(1) %arg)
; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31]
; GFX908-NEXT: s_nop 15
; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a28
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a29
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a30
; GFX908-NEXT: v_accvgpr_read_b32 v3, a31
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a28
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:112
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a16
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a17
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a18
; GFX908-NEXT: v_accvgpr_read_b32 v3, a19
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a16
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a20
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a21
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a22
; GFX908-NEXT: v_accvgpr_read_b32 v3, a23
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a20
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a8
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a9
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a10
; GFX908-NEXT: v_accvgpr_read_b32 v3, a11
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a8
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a12
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a13
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
; GFX908-NEXT: v_accvgpr_read_b32 v3, a15
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a12
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35]
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16
; GFX908-NEXT: s_endpgm
-;
-; GFX90ADAG-LABEL: test_mfma_f32_32x32x1f32_agpr:
-; GFX90ADAG: ; %bb.0: ; %bb
-; GFX90ADAG-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
-; GFX90ADAG-NEXT: v_mov_b32_e32 v1, 1.0
-; GFX90ADAG-NEXT: v_mov_b32_e32 v2, 2.0
-; GFX90ADAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX90ADAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90ADAG-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0
-; GFX90ADAG-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40
-; GFX90ADAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90ADAG-NEXT: v_accvgpr_write_b32 a0, s16
-; GFX90ADAG-NEXT: v_accvgpr_write_b32 a1, s17
-; GFX90ADAG-NEXT: v_accvgpr_write_b32 a2, s18
-; GFX90ADAG-NEXT: v_accvgpr_write_b32 a3, s19
-; GFX90ADAG-NEXT: v_accvgpr_write_b32 a4, s20
-; GFX90ADAG-NEXT: v_accvgpr_write_b32 a5, s21
-; GFX90ADAG-NEXT: v_accvgpr_write_b32 a6, s22
-; GFX90ADAG-NEXT: v_accvgpr_write_b32 a7, s23
-; GFX90ADAG-NEXT: v_accvgpr_write_b32 a8, s24
-; GFX90ADAG-NEXT: v_accvgpr_write_b32 a9, s25
-; GFX90ADAG-NEXT: v_accvgpr_write_b32 a10, s26
-; GFX90ADAG-NEXT: v_accvgpr_write_b32 a11, s27
-; GFX90ADAG-NEXT: v_accvgpr_write_b32 a12, s28
-; GFX90ADAG-NEXT: v_accvgpr_write_b32 a13, s29
-; GFX90ADAG-NEXT: v_accvgpr_write_b32 a14, s30
-; GFX90ADAG-NEXT: v_accvgpr_write_b32 a15, s31
-; GFX90ADAG-NEXT: v_accvgpr_write_b32 a16, s0
-; GFX90ADAG-NEXT: v_accvgpr_write_b32 a17, s1
-; GFX90ADAG-NEXT: v_accvgpr_write_b32 a18, s2
-; GFX90ADAG-NEXT: v_accvgpr_write_b32 a19, s3
-; GFX90ADAG-NEXT: v_accvgpr_write_b32 a20, s4
-; GFX90ADAG-NEXT: v_accvgpr_write_b32 a21, s5
-; GFX90ADAG-NEXT: v_accvgpr_write_b32 a22, s6
-; GFX90ADAG-NEXT: v_accvgpr_write_b32 a23, s7
-; GFX90ADAG-NEXT: v_accvgpr_write_b32 a24, s8
-; GFX90ADAG-NEXT: v_accvgpr_write_b32 a25, s9
-; GFX90ADAG-NEXT: v_accvgpr_write_b32 a26, s10
-; GFX90ADAG-NEXT: v_accvgpr_write_b32 a27, s11
-; GFX90ADAG-NEXT: v_accvgpr_write_b32 a28, s12
-; GFX90ADAG-NEXT: v_accvgpr_write_b32 a29, s13
-; GFX90ADAG-NEXT: v_accvgpr_write_b32 a30, s14
-; GFX90ADAG-NEXT: v_accvgpr_write_b32 a31, s15
-; GFX90ADAG-NEXT: s_nop 1
-; GFX90ADAG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
-; GFX90ADAG-NEXT: s_nop 15
-; GFX90ADAG-NEXT: s_nop 2
-; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96
-; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112
-; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[16:19], s[34:35] offset:64
-; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[20:23], s[34:35] offset:80
-; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[8:11], s[34:35] offset:32
-; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[12:15], s[34:35] offset:48
-; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35]
-; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16
-; GFX90ADAG-NEXT: s_endpgm
-;
-; GFX90AGSEL-LABEL: test_mfma_f32_32x32x1f32_agpr:
-; GFX90AGSEL: ; %bb.0: ; %bb
-; GFX90AGSEL-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
-; GFX90AGSEL-NEXT: v_mov_b32_e32 v0, 1.0
-; GFX90AGSEL-NEXT: v_mov_b32_e32 v1, 2.0
-; GFX90AGSEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90AGSEL-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x0
-; GFX90AGSEL-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x40
-; GFX90AGSEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a16, s16
-; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a4, s4
-; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a5, s5
-; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a6, s6
-; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a7, s7
-; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a8, s8
-; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a9, s9
-; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a10, s10
-; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a11, s11
-; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a12, s12
-; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a13, s13
-; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a14, s14
-; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a15, s15
-; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a17, s17
-; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a18, s18
-; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a19, s19
-; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a20, s20
-; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a21, s21
-; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a22, s22
-; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a23, s23
-; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a24, s24
-; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a25, s25
-; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a26, s26
-; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a27, s27
-; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a28, s28
-; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a29, s29
-; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a30, s30
-; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a31, s31
-; GFX90AGSEL-NEXT: s_nop 1
-; GFX90AGSEL-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
-; GFX90AGSEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX90AGSEL-NEXT: s_nop 15
-; GFX90AGSEL-NEXT: s_nop 1
-; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35]
-; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16
-; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[8:11], s[34:35] offset:32
-; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[12:15], s[34:35] offset:48
-; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[16:19], s[34:35] offset:64
-; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[20:23], s[34:35] offset:80
-; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96
-; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112
-; GFX90AGSEL-NEXT: s_endpgm
-;
-; GFX942DAG-LABEL: test_mfma_f32_32x32x1f32_agpr:
-; GFX942DAG: ; %bb.0: ; %bb
-; GFX942DAG-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
-; GFX942DAG-NEXT: v_mov_b32_e32 v1, 1.0
-; GFX942DAG-NEXT: v_mov_b32_e32 v2, 2.0
-; GFX942DAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX942DAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942DAG-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0
-; GFX942DAG-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40
-; GFX942DAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942DAG-NEXT: v_accvgpr_write_b32 a0, s16
-; GFX942DAG-NEXT: v_accvgpr_write_b32 a1, s17
-; GFX942DAG-NEXT: v_accvgpr_write_b32 a2, s18
-; GFX942DAG-NEXT: v_accvgpr_write_b32 a3, s19
-; GFX942DAG-NEXT: v_accvgpr_write_b32 a4, s20
-; GFX942DAG-NEXT: v_accvgpr_write_b32 a5, s21
-; GFX942DAG-NEXT: v_accvgpr_write_b32 a6, s22
-; GFX942DAG-NEXT: v_accvgpr_write_b32 a7, s23
-; GFX942DAG-NEXT: v_accvgpr_write_b32 a8, s24
-; GFX942DAG-NEXT: v_accvgpr_write_b32 a9, s25
-; GFX942DAG-NEXT: v_accvgpr_write_b32 a10, s26
-; GFX942DAG-NEXT: v_accvgpr_write_b32 a11, s27
-; GFX942DAG-NEXT: v_accvgpr_write_b32 a12, s28
-; GFX942DAG-NEXT: v_accvgpr_write_b32 a13, s29
-; GFX942DAG-NEXT: v_accvgpr_write_b32 a14, s30
-; GFX942DAG-NEXT: v_accvgpr_write_b32 a15, s31
-; GFX942DAG-NEXT: v_accvgpr_write_b32 a16, s0
-; GFX942DAG-NEXT: v_accvgpr_write_b32 a17, s1
-; GFX942DAG-NEXT: v_accvgpr_write_b32 a18, s2
-; GFX942DAG-NEXT: v_accvgpr_write_b32 a19, s3
-; GFX942DAG-NEXT: v_accvgpr_write_b32 a20, s4
-; GFX942DAG-NEXT: v_accvgpr_write_b32 a21, s5
-; GFX942DAG-NEXT: v_accvgpr_write_b32 a22, s6
-; GFX942DAG-NEXT: v_accvgpr_write_b32 a23, s7
-; GFX942DAG-NEXT: v_accvgpr_write_b32 a24, s8
-; GFX942DAG-NEXT: v_accvgpr_write_b32 a25, s9
-; GFX942DAG-NEXT: v_accvgpr_write_b32 a26, s10
-; GFX942DAG-NEXT: v_accvgpr_write_b32 a27, s11
-; GFX942DAG-NEXT: v_accvgpr_write_b32 a28, s12
-; GFX942DAG-NEXT: v_accvgpr_write_b32 a29, s13
-; GFX942DAG-NEXT: v_accvgpr_write_b32 a30, s14
-; GFX942DAG-NEXT: v_accvgpr_write_b32 a31, s15
-; GFX942DAG-NEXT: s_nop 1
-; GFX942DAG-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v2, a[0:31]
-; GFX942DAG-NEXT: s_nop 15
-; GFX942DAG-NEXT: s_nop 1
-; GFX942DAG-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96
-; GFX942DAG-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112
-; GFX942DAG-NEXT: global_store_dwordx4 v0, a[16:19], s[34:35] offset:64
-; GFX942DAG-NEXT: global_store_dwordx4 v0, a[20:23], s[34:35] offset:80
-; GFX942DAG-NEXT: global_store_dwordx4 v0, a[8:11], s[34:35] offset:32
-; GFX942DAG-NEXT: global_store_dwordx4 v0, a[12:15], s[34:35] offset:48
-; GFX942DAG-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35]
-; GFX942DAG-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16
-; GFX942DAG-NEXT: s_endpgm
-;
-; GFX942GSEL-LABEL: test_mfma_f32_32x32x1f32_agpr:
-; GFX942GSEL: ; %bb.0: ; %bb
-; GFX942GSEL-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
-; GFX942GSEL-NEXT: v_mov_b32_e32 v0, 1.0
-; GFX942GSEL-NEXT: v_mov_b32_e32 v1, 2.0
-; GFX942GSEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942GSEL-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x0
-; GFX942GSEL-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x40
-; GFX942GSEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942GSEL-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX942GSEL-NEXT: v_accvgpr_write_b32 a16, s16
-; GFX942GSEL-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX942GSEL-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX942GSEL-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX942GSEL-NEXT: v_accvgpr_write_b32 a4, s4
-; GFX942GSEL-NEXT: v_accvgpr_write_b32 a5, s5
-; GFX942GSEL-NEXT: v_accvgpr_write_b32 a6, s6
-; GFX942GSEL-NEXT: v_accvgpr_write_b32 a7, s7
-; GFX942GSEL-NEXT: v_accvgpr_write_b32 a8, s8
-; GFX942GSEL-NEXT: v_accvgpr_write_b32 a9, s9
-; GFX942GSEL-NEXT: v_accvgpr_write_b32 a10, s10
-; GFX942GSEL-NEXT: v_accvgpr_write_b32 a11, s11
-; GFX942GSEL-NEXT: v_accvgpr_write_b32 a12, s12
-; GFX942GSEL-NEXT: v_accvgpr_write_b32 a13, s13
-; GFX942GSEL-NEXT: v_accvgpr_write_b32 a14, s14
-; GFX942GSEL-NEXT: v_accvgpr_write_b32 a15, s15
-; GFX942GSEL-NEXT: v_accvgpr_write_b32 a17, s17
-; GFX942GSEL-NEXT: v_accvgpr_write_b32 a18, s18
-; GFX942GSEL-NEXT: v_accvgpr_write_b32 a19, s19
-; GFX942GSEL-NEXT: v_accvgpr_write_b32 a20, s20
-; GFX942GSEL-NEXT: v_accvgpr_write_b32 a21, s21
-; GFX942GSEL-NEXT: v_accvgpr_write_b32 a22, s22
-; GFX942GSEL-NEXT: v_accvgpr_write_b32 a23, s23
-; GFX942GSEL-NEXT: v_accvgpr_write_b32 a24, s24
-; GFX942GSEL-NEXT: v_accvgpr_write_b32 a25, s25
-; GFX942GSEL-NEXT: v_accvgpr_write_b32 a26, s26
-; GFX942GSEL-NEXT: v_accvgpr_write_b32 a27, s27
-; GFX942GSEL-NEXT: v_accvgpr_write_b32 a28, s28
-; GFX942GSEL-NEXT: v_accvgpr_write_b32 a29, s29
-; GFX942GSEL-NEXT: v_accvgpr_write_b32 a30, s30
-; GFX942GSEL-NEXT: v_accvgpr_write_b32 a31, s31
-; GFX942GSEL-NEXT: s_nop 1
-; GFX942GSEL-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]
-; GFX942GSEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX942GSEL-NEXT: s_nop 15
-; GFX942GSEL-NEXT: s_nop 0
-; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35]
-; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16
-; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[8:11], s[34:35] offset:32
-; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[12:15], s[34:35] offset:48
-; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[16:19], s[34:35] offset:64
-; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[20:23], s[34:35] offset:80
-; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96
-; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112
-; GFX942GSEL-NEXT: s_endpgm
bb:
%in.1 = load <32 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0)
@@ -763,40 +347,40 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_virtual_agpr(ptr
; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
; GFX908-NEXT: s_nop 15
; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
-; GFX908-NEXT: v_accvgpr_read_b32 v4, a28
-; GFX908-NEXT: v_accvgpr_read_b32 v5, a29
-; GFX908-NEXT: v_accvgpr_read_b32 v6, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
; GFX908-NEXT: v_accvgpr_read_b32 v7, a31
-; GFX908-NEXT: v_accvgpr_read_b32 v8, a16
-; GFX908-NEXT: v_accvgpr_read_b32 v9, a17
-; GFX908-NEXT: v_accvgpr_read_b32 v10, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a28
; GFX908-NEXT: v_accvgpr_read_b32 v11, a19
-; GFX908-NEXT: v_accvgpr_read_b32 v12, a20
-; GFX908-NEXT: v_accvgpr_read_b32 v13, a21
-; GFX908-NEXT: v_accvgpr_read_b32 v14, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v10, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v9, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v8, a16
; GFX908-NEXT: v_accvgpr_read_b32 v15, a23
-; GFX908-NEXT: v_accvgpr_read_b32 v16, a8
-; GFX908-NEXT: v_accvgpr_read_b32 v17, a9
-; GFX908-NEXT: v_accvgpr_read_b32 v18, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v14, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v13, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v12, a20
; GFX908-NEXT: v_accvgpr_read_b32 v19, a11
-; GFX908-NEXT: v_accvgpr_read_b32 v20, a12
-; GFX908-NEXT: v_accvgpr_read_b32 v21, a13
-; GFX908-NEXT: v_accvgpr_read_b32 v22, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v18, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v17, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v16, a8
; GFX908-NEXT: v_accvgpr_read_b32 v23, a15
-; GFX908-NEXT: v_accvgpr_read_b32 v24, a0
-; GFX908-NEXT: v_accvgpr_read_b32 v25, a1
-; GFX908-NEXT: v_accvgpr_read_b32 v26, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v22, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v21, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v20, a12
; GFX908-NEXT: v_accvgpr_read_b32 v27, a3
+; GFX908-NEXT: v_accvgpr_read_b32 v26, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v25, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v24, a0
; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:96
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:112
; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:64
; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:80
@@ -805,134 +389,6 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_virtual_agpr(ptr
; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1]
; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:16
; GFX908-NEXT: s_endpgm
-;
-; GFX90ADAG-LABEL: test_mfma_f32_32x32x1f32_inline_asm_virtual_agpr:
-; GFX90ADAG: ; %bb.0: ; %bb
-; GFX90ADAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX90ADAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX90ADAG-NEXT: ;;#ASMSTART
-; GFX90ADAG-NEXT: ; def a0
-; GFX90ADAG-NEXT: ;;#ASMEND
-; GFX90ADAG-NEXT: v_mov_b32_e32 v1, 1.0
-; GFX90ADAG-NEXT: v_mov_b32_e32 v2, 2.0
-; GFX90ADAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90ADAG-NEXT: global_load_dwordx4 a[28:31], v0, s[0:1] offset:112
-; GFX90ADAG-NEXT: global_load_dwordx4 a[24:27], v0, s[0:1] offset:96
-; GFX90ADAG-NEXT: global_load_dwordx4 a[20:23], v0, s[0:1] offset:80
-; GFX90ADAG-NEXT: global_load_dwordx4 a[16:19], v0, s[0:1] offset:64
-; GFX90ADAG-NEXT: global_load_dwordx4 a[12:15], v0, s[0:1] offset:48
-; GFX90ADAG-NEXT: global_load_dwordx4 a[8:11], v0, s[0:1] offset:32
-; GFX90ADAG-NEXT: global_load_dwordx4 a[4:7], v0, s[0:1] offset:16
-; GFX90ADAG-NEXT: global_load_dwordx4 a[0:3], v0, s[0:1]
-; GFX90ADAG-NEXT: s_waitcnt vmcnt(0)
-; GFX90ADAG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
-; GFX90ADAG-NEXT: s_nop 15
-; GFX90ADAG-NEXT: s_nop 2
-; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
-; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
-; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
-; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
-; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
-; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GFX90ADAG-NEXT: s_endpgm
-;
-; GFX90AGSEL-LABEL: test_mfma_f32_32x32x1f32_inline_asm_virtual_agpr:
-; GFX90AGSEL: ; %bb.0: ; %bb
-; GFX90AGSEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX90AGSEL-NEXT: ;;#ASMSTART
-; GFX90AGSEL-NEXT: ; def a0
-; GFX90AGSEL-NEXT: ;;#ASMEND
-; GFX90AGSEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX90AGSEL-NEXT: v_mov_b32_e32 v1, 1.0
-; GFX90AGSEL-NEXT: v_mov_b32_e32 v2, 2.0
-; GFX90AGSEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90AGSEL-NEXT: global_load_dwordx4 a[0:3], v0, s[0:1]
-; GFX90AGSEL-NEXT: global_load_dwordx4 a[4:7], v0, s[0:1] offset:16
-; GFX90AGSEL-NEXT: global_load_dwordx4 a[8:11], v0, s[0:1] offset:32
-; GFX90AGSEL-NEXT: global_load_dwordx4 a[12:15], v0, s[0:1] offset:48
-; GFX90AGSEL-NEXT: global_load_dwordx4 a[16:19], v0, s[0:1] offset:64
-; GFX90AGSEL-NEXT: global_load_dwordx4 a[20:23], v0, s[0:1] offset:80
-; GFX90AGSEL-NEXT: global_load_dwordx4 a[24:27], v0, s[0:1] offset:96
-; GFX90AGSEL-NEXT: global_load_dwordx4 a[28:31], v0, s[0:1] offset:112
-; GFX90AGSEL-NEXT: s_waitcnt vmcnt(0)
-; GFX90AGSEL-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
-; GFX90AGSEL-NEXT: s_nop 15
-; GFX90AGSEL-NEXT: s_nop 2
-; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
-; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
-; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
-; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
-; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
-; GFX90AGSEL-NEXT: s_endpgm
-;
-; GFX942DAG-LABEL: test_mfma_f32_32x32x1f32_inline_asm_virtual_agpr:
-; GFX942DAG: ; %bb.0: ; %bb
-; GFX942DAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX942DAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX942DAG-NEXT: ;;#ASMSTART
-; GFX942DAG-NEXT: ; def a0
-; GFX942DAG-NEXT: ;;#ASMEND
-; GFX942DAG-NEXT: v_mov_b32_e32 v1, 1.0
-; GFX942DAG-NEXT: v_mov_b32_e32 v2, 2.0
-; GFX942DAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942DAG-NEXT: global_load_dwordx4 a[28:31], v0, s[0:1] offset:112
-; GFX942DAG-NEXT: global_load_dwordx4 a[24:27], v0, s[0:1] offset:96
-; GFX942DAG-NEXT: global_load_dwordx4 a[20:23], v0, s[0:1] offset:80
-; GFX942DAG-NEXT: global_load_dwordx4 a[16:19], v0, s[0:1] offset:64
-; GFX942DAG-NEXT: global_load_dwordx4 a[12:15], v0, s[0:1] offset:48
-; GFX942DAG-NEXT: global_load_dwordx4 a[8:11], v0, s[0:1] offset:32
-; GFX942DAG-NEXT: global_load_dwordx4 a[4:7], v0, s[0:1] offset:16
-; GFX942DAG-NEXT: global_load_dwordx4 a[0:3], v0, s[0:1]
-; GFX942DAG-NEXT: s_waitcnt vmcnt(0)
-; GFX942DAG-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v2, a[0:31]
-; GFX942DAG-NEXT: s_nop 15
-; GFX942DAG-NEXT: s_nop 1
-; GFX942DAG-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
-; GFX942DAG-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
-; GFX942DAG-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
-; GFX942DAG-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
-; GFX942DAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GFX942DAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; GFX942DAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
-; GFX942DAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GFX942DAG-NEXT: s_endpgm
-;
-; GFX942GSEL-LABEL: test_mfma_f32_32x32x1f32_inline_asm_virtual_agpr:
-; GFX942GSEL: ; %bb.0: ; %bb
-; GFX942GSEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX942GSEL-NEXT: ;;#ASMSTART
-; GFX942GSEL-NEXT: ; def a0
-; GFX942GSEL-NEXT: ;;#ASMEND
-; GFX942GSEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX942GSEL-NEXT: v_mov_b32_e32 v1, 1.0
-; GFX942GSEL-NEXT: v_mov_b32_e32 v2, 2.0
-; GFX942GSEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942GSEL-NEXT: global_load_dwordx4 a[0:3], v0, s[0:1]
-; GFX942GSEL-NEXT: global_load_dwordx4 a[4:7], v0, s[0:1] offset:16
-; GFX942GSEL-NEXT: global_load_dwordx4 a[8:11], v0, s[0:1] offset:32
-; GFX942GSEL-NEXT: global_load_dwordx4 a[12:15], v0, s[0:1] offset:48
-; GFX942GSEL-NEXT: global_load_dwordx4 a[16:19], v0, s[0:1] offset:64
-; GFX942GSEL-NEXT: global_load_dwordx4 a[20:23], v0, s[0:1] offset:80
-; GFX942GSEL-NEXT: global_load_dwordx4 a[24:27], v0, s[0:1] offset:96
-; GFX942GSEL-NEXT: global_load_dwordx4 a[28:31], v0, s[0:1] offset:112
-; GFX942GSEL-NEXT: s_waitcnt vmcnt(0)
-; GFX942GSEL-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v2, a[0:31]
-; GFX942GSEL-NEXT: s_nop 15
-; GFX942GSEL-NEXT: s_nop 1
-; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
-; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
-; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
-; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
-; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
-; GFX942GSEL-NEXT: s_endpgm
bb:
%acc = call i32 asm sideeffect "; def $0", "={a0}"()
%in.1 = load <32 x float>, ptr addrspace(1) %arg
@@ -997,40 +453,40 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_phys_agpr(ptr add
; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
; GFX908-NEXT: s_nop 15
; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
-; GFX908-NEXT: v_accvgpr_read_b32 v4, a28
-; GFX908-NEXT: v_accvgpr_read_b32 v5, a29
-; GFX908-NEXT: v_accvgpr_read_b32 v6, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
; GFX908-NEXT: v_accvgpr_read_b32 v7, a31
-; GFX908-NEXT: v_accvgpr_read_b32 v8, a16
-; GFX908-NEXT: v_accvgpr_read_b32 v9, a17
-; GFX908-NEXT: v_accvgpr_read_b32 v10, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a28
; GFX908-NEXT: v_accvgpr_read_b32 v11, a19
-; GFX908-NEXT: v_accvgpr_read_b32 v12, a20
-; GFX908-NEXT: v_accvgpr_read_b32 v13, a21
-; GFX908-NEXT: v_accvgpr_read_b32 v14, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v10, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v9, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v8, a16
; GFX908-NEXT: v_accvgpr_read_b32 v15, a23
-; GFX908-NEXT: v_accvgpr_read_b32 v16, a8
-; GFX908-NEXT: v_accvgpr_read_b32 v17, a9
-; GFX908-NEXT: v_accvgpr_read_b32 v18, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v14, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v13, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v12, a20
; GFX908-NEXT: v_accvgpr_read_b32 v19, a11
-; GFX908-NEXT: v_accvgpr_read_b32 v20, a12
-; GFX908-NEXT: v_accvgpr_read_b32 v21, a13
-; GFX908-NEXT: v_accvgpr_read_b32 v22, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v18, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v17, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v16, a8
; GFX908-NEXT: v_accvgpr_read_b32 v23, a15
-; GFX908-NEXT: v_accvgpr_read_b32 v24, a0
-; GFX908-NEXT: v_accvgpr_read_b32 v25, a1
-; GFX908-NEXT: v_accvgpr_read_b32 v26, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v22, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v21, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v20, a12
; GFX908-NEXT: v_accvgpr_read_b32 v27, a3
+; GFX908-NEXT: v_accvgpr_read_b32 v26, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v25, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v24, a0
; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:96
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:112
; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:64
; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:80
@@ -1039,134 +495,6 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_phys_agpr(ptr add
; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1]
; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:16
; GFX908-NEXT: s_endpgm
-;
-; GFX90ADAG-LABEL: test_mfma_f32_32x32x1f32_inline_asm_phys_agpr:
-; GFX90ADAG: ; %bb.0: ; %bb
-; GFX90ADAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX90ADAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX90ADAG-NEXT: ;;#ASMSTART
-; GFX90ADAG-NEXT: ; use a[100:131]
-; GFX90ADAG-NEXT: ;;#ASMEND
-; GFX90ADAG-NEXT: v_mov_b32_e32 v1, 1.0
-; GFX90ADAG-NEXT: v_mov_b32_e32 v2, 2.0
-; GFX90ADAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90ADAG-NEXT: global_load_dwordx4 a[28:31], v0, s[0:1] offset:112
-; GFX90ADAG-NEXT: global_load_dwordx4 a[24:27], v0, s[0:1] offset:96
-; GFX90ADAG-NEXT: global_load_dwordx4 a[20:23], v0, s[0:1] offset:80
-; GFX90ADAG-NEXT: global_load_dwordx4 a[16:19], v0, s[0:1] offset:64
-; GFX90ADAG-NEXT: global_load_dwordx4 a[12:15], v0, s[0:1] offset:48
-; GFX90ADAG-NEXT: global_load_dwordx4 a[8:11], v0, s[0:1] offset:32
-; GFX90ADAG-NEXT: global_load_dwordx4 a[4:7], v0, s[0:1] offset:16
-; GFX90ADAG-NEXT: global_load_dwordx4 a[0:3], v0, s[0:1]
-; GFX90ADAG-NEXT: s_waitcnt vmcnt(0)
-; GFX90ADAG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
-; GFX90ADAG-NEXT: s_nop 15
-; GFX90ADAG-NEXT: s_nop 2
-; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
-; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
-; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
-; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
-; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
-; GFX90ADAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GFX90ADAG-NEXT: s_endpgm
-;
-; GFX90AGSEL-LABEL: test_mfma_f32_32x32x1f32_inline_asm_phys_agpr:
-; GFX90AGSEL: ; %bb.0: ; %bb
-; GFX90AGSEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX90AGSEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX90AGSEL-NEXT: ;;#ASMSTART
-; GFX90AGSEL-NEXT: ; use a[100:131]
-; GFX90AGSEL-NEXT: ;;#ASMEND
-; GFX90AGSEL-NEXT: v_mov_b32_e32 v1, 1.0
-; GFX90AGSEL-NEXT: v_mov_b32_e32 v2, 2.0
-; GFX90AGSEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90AGSEL-NEXT: global_load_dwordx4 a[0:3], v0, s[0:1]
-; GFX90AGSEL-NEXT: global_load_dwordx4 a[4:7], v0, s[0:1] offset:16
-; GFX90AGSEL-NEXT: global_load_dwordx4 a[8:11], v0, s[0:1] offset:32
-; GFX90AGSEL-NEXT: global_load_dwordx4 a[12:15], v0, s[0:1] offset:48
-; GFX90AGSEL-NEXT: global_load_dwordx4 a[16:19], v0, s[0:1] offset:64
-; GFX90AGSEL-NEXT: global_load_dwordx4 a[20:23], v0, s[0:1] offset:80
-; GFX90AGSEL-NEXT: global_load_dwordx4 a[24:27], v0, s[0:1] offset:96
-; GFX90AGSEL-NEXT: global_load_dwordx4 a[28:31], v0, s[0:1] offset:112
-; GFX90AGSEL-NEXT: s_waitcnt vmcnt(0)
-; GFX90AGSEL-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
-; GFX90AGSEL-NEXT: s_nop 15
-; GFX90AGSEL-NEXT: s_nop 2
-; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
-; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
-; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
-; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
-; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
-; GFX90AGSEL-NEXT: s_endpgm
-;
-; GFX942DAG-LABEL: test_mfma_f32_32x32x1f32_inline_asm_phys_agpr:
-; GFX942DAG: ; %bb.0: ; %bb
-; GFX942DAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX942DAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX942DAG-NEXT: ;;#ASMSTART
-; GFX942DAG-NEXT: ; use a[100:131]
-; GFX942DAG-NEXT: ;;#ASMEND
-; GFX942DAG-NEXT: v_mov_b32_e32 v1, 1.0
-; GFX942DAG-NEXT: v_mov_b32_e32 v2, 2.0
-; GFX942DAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942DAG-NEXT: global_load_dwordx4 a[28:31], v0, s[0:1] offset:112
-; GFX942DAG-NEXT: global_load_dwordx4 a[24:27], v0, s[0:1] offset:96
-; GFX942DAG-NEXT: global_load_dwordx4 a[20:23], v0, s[0:1] offset:80
-; GFX942DAG-NEXT: global_load_dwordx4 a[16:19], v0, s[0:1] offset:64
-; GFX942DAG-NEXT: global_load_dwordx4 a[12:15], v0, s[0:1] offset:48
-; GFX942DAG-NEXT: global_load_dwordx4 a[8:11], v0, s[0:1] offset:32
-; GFX942DAG-NEXT: global_load_dwordx4 a[4:7], v0, s[0:1] offset:16
-; GFX942DAG-NEXT: global_load_dwordx4 a[0:3], v0, s[0:1]
-; GFX942DAG-NEXT: s_waitcnt vmcnt(0)
-; GFX942DAG-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v2, a[0:31]
-; GFX942DAG-NEXT: s_nop 15
-; GFX942DAG-NEXT: s_nop 1
-; GFX942DAG-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
-; GFX942DAG-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
-; GFX942DAG-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
-; GFX942DAG-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
-; GFX942DAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GFX942DAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; GFX942DAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
-; GFX942DAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GFX942DAG-NEXT: s_endpgm
-;
-; GFX942GSEL-LABEL: test_mfma_f32_32x32x1f32_inline_asm_phys_agpr:
-; GFX942GSEL: ; %bb.0: ; %bb
-; GFX942GSEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX942GSEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX942GSEL-NEXT: ;;#ASMSTART
-; GFX942GSEL-NEXT: ; use a[100:131]
-; GFX942GSEL-NEXT: ;;#ASMEND
-; GFX942GSEL-NEXT: v_mov_b32_e32 v1, 1.0
-; GFX942GSEL-NEXT: v_mov_b32_e32 v2, 2.0
-; GFX942GSEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942GSEL-NEXT: global_load_dwordx4 a[0:3], v0, s[0:1]
-; GFX942GSEL-NEXT: global_load_dwordx4 a[4:7], v0, s[0:1] offset:16
-; GFX942GSEL-NEXT: global_load_dwordx4 a[8:11], v0, s[0:1] offset:32
-; GFX942GSEL-NEXT: global_load_dwordx4 a[12:15], v0, s[0:1] offset:48
-; GFX942GSEL-NEXT: global_load_dwordx4 a[16:19], v0, s[0:1] offset:64
-; GFX942GSEL-NEXT: global_load_dwordx4 a[20:23], v0, s[0:1] offset:80
-; GFX942GSEL-NEXT: global_load_dwordx4 a[24:27], v0, s[0:1] offset:96
-; GFX942GSEL-NEXT: global_load_dwordx4 a[28:31], v0, s[0:1] offset:112
-; GFX942GSEL-NEXT: s_waitcnt vmcnt(0)
-; GFX942GSEL-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v2, a[0:31]
-; GFX942GSEL-NEXT: s_nop 15
-; GFX942GSEL-NEXT: s_nop 1
-; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
-; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
-; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
-; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
-; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
-; GFX942GSEL-NEXT: s_endpgm
bb:
call void asm sideeffect "; use $0", "{a[100:131]}"(<32 x float> poison)
%in.1 = load <32 x float>, ptr addrspace(1) %arg
@@ -1231,40 +559,40 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_no_agprs(ptr addr
; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
; GFX908-NEXT: s_nop 15
; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
-; GFX908-NEXT: v_accvgpr_read_b32 v4, a28
-; GFX908-NEXT: v_accvgpr_read_b32 v5, a29
-; GFX908-NEXT: v_accvgpr_read_b32 v6, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
; GFX908-NEXT: v_accvgpr_read_b32 v7, a31
-; GFX908-NEXT: v_accvgpr_read_b32 v8, a16
-; GFX908-NEXT: v_accvgpr_read_b32 v9, a17
-; GFX908-NEXT: v_accvgpr_read_b32 v10, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a28
; GFX908-NEXT: v_accvgpr_read_b32 v11, a19
-; GFX908-NEXT: v_accvgpr_read_b32 v12, a20
-; GFX908-NEXT: v_accvgpr_read_b32 v13, a21
-; GFX908-NEXT: v_accvgpr_read_b32 v14, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v10, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v9, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v8, a16
; GFX908-NEXT: v_accvgpr_read_b32 v15, a23
-; GFX908-NEXT: v_accvgpr_read_b32 v16, a8
-; GFX908-NEXT: v_accvgpr_read_b32 v17, a9
-; GFX908-NEXT: v_accvgpr_read_b32 v18, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v14, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v13, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v12, a20
; GFX908-NEXT: v_accvgpr_read_b32 v19, a11
-; GFX908-NEXT: v_accvgpr_read_b32 v20, a12
-; GFX908-NEXT: v_accvgpr_read_b32 v21, a13
-; GFX908-NEXT: v_accvgpr_read_b32 v22, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v18, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v17, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v16, a8
; GFX908-NEXT: v_accvgpr_read_b32 v23, a15
-; GFX908-NEXT: v_accvgpr_read_b32 v24, a0
-; GFX908-NEXT: v_accvgpr_read_b32 v25, a1
-; GFX908-NEXT: v_accvgpr_read_b32 v26, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v22, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v21, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v20, a12
; GFX908-NEXT: v_accvgpr_read_b32 v27, a3
+; GFX908-NEXT: v_accvgpr_read_b32 v26, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v25, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v24, a0
; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:96
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:112
; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:64
; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:80
@@ -1273,134 +601,6 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_no_agprs(ptr addr
; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1]
; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:16
; GFX908-NEXT: s_endpgm
-;
-; GFX90ADAG-LABEL: test_mfma_f32_32x32x1f32_inline_asm_no_agprs:
-; GFX90ADAG: ; %bb.0: ; %bb
-; GFX90ADAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX90ADAG-NEXT: v_mov_b32_e32 v32, 0
-; GFX90ADAG-NEXT: ;;#ASMSTART
-; GFX90ADAG-NEXT: ; def v0
-; GFX90ADAG-NEXT: ;;#ASMEND
-; GFX90ADAG-NEXT: v_mov_b32_e32 v33, 1.0
-; GFX90ADAG-NEXT: v_mov_b32_e32 v34, 2.0
-; GFX90ADAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90ADAG-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
-; GFX90ADAG-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
-; GFX90ADAG-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
-; GFX90ADAG-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64
-; GFX90ADAG-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
-; GFX90ADAG-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32
-; GFX90ADAG-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
-; GFX90ADAG-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1]
-; GFX90ADAG-NEXT: s_waitcnt vmcnt(0)
-; GFX90ADAG-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v33, v34, v[0:31]
-; GFX90ADAG-NEXT: s_nop 15
-; GFX90ADAG-NEXT: s_nop 2
-; GFX90ADAG-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
-; GFX90ADAG-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
-; GFX90ADAG-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
-; GFX90ADAG-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
-; GFX90ADAG-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
-; GFX90ADAG-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
-; GFX90ADAG-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
-; GFX90ADAG-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
-; GFX90ADAG-NEXT: s_endpgm
-;
-; GFX90AGSEL-LABEL: test_mfma_f32_32x32x1f32_inline_asm_no_agprs:
-; GFX90AGSEL: ; %bb.0: ; %bb
-; GFX90AGSEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX90AGSEL-NEXT: ;;#ASMSTART
-; GFX90AGSEL-NEXT: ; def v0
-; GFX90AGSEL-NEXT: ;;#ASMEND
-; GFX90AGSEL-NEXT: v_mov_b32_e32 v32, 0
-; GFX90AGSEL-NEXT: v_mov_b32_e32 v33, 1.0
-; GFX90AGSEL-NEXT: v_mov_b32_e32 v34, 2.0
-; GFX90AGSEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90AGSEL-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1]
-; GFX90AGSEL-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
-; GFX90AGSEL-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32
-; GFX90AGSEL-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
-; GFX90AGSEL-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64
-; GFX90AGSEL-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
-; GFX90AGSEL-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
-; GFX90AGSEL-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
-; GFX90AGSEL-NEXT: s_waitcnt vmcnt(0)
-; GFX90AGSEL-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v33, v34, v[0:31]
-; GFX90AGSEL-NEXT: s_nop 15
-; GFX90AGSEL-NEXT: s_nop 2
-; GFX90AGSEL-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
-; GFX90AGSEL-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
-; GFX90AGSEL-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
-; GFX90AGSEL-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
-; GFX90AGSEL-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
-; GFX90AGSEL-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
-; GFX90AGSEL-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
-; GFX90AGSEL-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
-; GFX90AGSEL-NEXT: s_endpgm
-;
-; GFX942DAG-LABEL: test_mfma_f32_32x32x1f32_inline_asm_no_agprs:
-; GFX942DAG: ; %bb.0: ; %bb
-; GFX942DAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX942DAG-NEXT: v_mov_b32_e32 v32, 0
-; GFX942DAG-NEXT: ;;#ASMSTART
-; GFX942DAG-NEXT: ; def v0
-; GFX942DAG-NEXT: ;;#ASMEND
-; GFX942DAG-NEXT: v_mov_b32_e32 v33, 1.0
-; GFX942DAG-NEXT: v_mov_b32_e32 v34, 2.0
-; GFX942DAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942DAG-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
-; GFX942DAG-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
-; GFX942DAG-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
-; GFX942DAG-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64
-; GFX942DAG-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
-; GFX942DAG-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32
-; GFX942DAG-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
-; GFX942DAG-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1]
-; GFX942DAG-NEXT: s_waitcnt vmcnt(0)
-; GFX942DAG-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v33, v34, v[0:31]
-; GFX942DAG-NEXT: s_nop 15
-; GFX942DAG-NEXT: s_nop 1
-; GFX942DAG-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
-; GFX942DAG-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
-; GFX942DAG-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
-; GFX942DAG-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
-; GFX942DAG-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
-; GFX942DAG-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
-; GFX942DAG-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
-; GFX942DAG-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
-; GFX942DAG-NEXT: s_endpgm
-;
-; GFX942GSEL-LABEL: test_mfma_f32_32x32x1f32_inline_asm_no_agprs:
-; GFX942GSEL: ; %bb.0: ; %bb
-; GFX942GSEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX942GSEL-NEXT: ;;#ASMSTART
-; GFX942GSEL-NEXT: ; def v0
-; GFX942GSEL-NEXT: ;;#ASMEND
-; GFX942GSEL-NEXT: v_mov_b32_e32 v32, 0
-; GFX942GSEL-NEXT: v_mov_b32_e32 v33, 1.0
-; GFX942GSEL-NEXT: v_mov_b32_e32 v34, 2.0
-; GFX942GSEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942GSEL-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1]
-; GFX942GSEL-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
-; GFX942GSEL-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32
-; GFX942GSEL-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
-; GFX942GSEL-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64
-; GFX942GSEL-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
-; GFX942GSEL-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
-; GFX942GSEL-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
-; GFX942GSEL-NEXT: s_waitcnt vmcnt(0)
-; GFX942GSEL-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v33, v34, v[0:31]
-; GFX942GSEL-NEXT: s_nop 15
-; GFX942GSEL-NEXT: s_nop 1
-; GFX942GSEL-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
-; GFX942GSEL-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
-; GFX942GSEL-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
-; GFX942GSEL-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
-; GFX942GSEL-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
-; GFX942GSEL-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
-; GFX942GSEL-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
-; GFX942GSEL-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
-; GFX942GSEL-NEXT: s_endpgm
bb:
%acc = call i32 asm sideeffect "; def $0", "={v0}"()
%in.1 = load <32 x float>, ptr addrspace(1) %arg
@@ -1487,40 +687,40 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_call(ptr addrspace(1) %arg)
; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
; GFX908-NEXT: s_nop 15
; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
-; GFX908-NEXT: v_accvgpr_read_b32 v4, a28
-; GFX908-NEXT: v_accvgpr_read_b32 v5, a29
-; GFX908-NEXT: v_accvgpr_read_b32 v6, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
; GFX908-NEXT: v_accvgpr_read_b32 v7, a31
-; GFX908-NEXT: v_accvgpr_read_b32 v8, a16
-; GFX908-NEXT: v_accvgpr_read_b32 v9, a17
-; GFX908-NEXT: v_accvgpr_read_b32 v10, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a28
; GFX908-NEXT: v_accvgpr_read_b32 v11, a19
-; GFX908-NEXT: v_accvgpr_read_b32 v12, a20
-; GFX908-NEXT: v_accvgpr_read_b32 v13, a21
-; GFX908-NEXT: v_accvgpr_read_b32 v14, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v10, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v9, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v8, a16
; GFX908-NEXT: v_accvgpr_read_b32 v15, a23
-; GFX908-NEXT: v_accvgpr_read_b32 v16, a8
-; GFX908-NEXT: v_accvgpr_read_b32 v17, a9
-; GFX908-NEXT: v_accvgpr_read_b32 v18, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v14, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v13, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v12, a20
; GFX908-NEXT: v_accvgpr_read_b32 v19, a11
-; GFX908-NEXT: v_accvgpr_read_b32 v20, a12
-; GFX908-NEXT: v_accvgpr_read_b32 v21, a13
-; GFX908-NEXT: v_accvgpr_read_b32 v22, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v18, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v17, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v16, a8
; GFX908-NEXT: v_accvgpr_read_b32 v23, a15
-; GFX908-NEXT: v_accvgpr_read_b32 v24, a0
-; GFX908-NEXT: v_accvgpr_read_b32 v25, a1
-; GFX908-NEXT: v_accvgpr_read_b32 v26, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v22, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v21, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v20, a12
; GFX908-NEXT: v_accvgpr_read_b32 v27, a3
+; GFX908-NEXT: v_accvgpr_read_b32 v26, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v25, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v24, a0
; GFX908-NEXT: global_store_dwordx4 v40, v[0:3], s[34:35] offset:96
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
; GFX908-NEXT: global_store_dwordx4 v40, v[4:7], s[34:35] offset:112
; GFX908-NEXT: global_store_dwordx4 v40, v[8:11], s[34:35] offset:64
; GFX908-NEXT: global_store_dwordx4 v40, v[12:15], s[34:35] offset:80
@@ -1529,205 +729,6 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_call(ptr addrspace(1) %arg)
; GFX908-NEXT: global_store_dwordx4 v40, v[24:27], s[34:35]
; GFX908-NEXT: global_store_dwordx4 v40, v[0:3], s[34:35] offset:16
; GFX908-NEXT: s_endpgm
-;
-; GFX90ADAG-LABEL: test_mfma_f32_32x32x1f32_call:
-; GFX90ADAG: ; %bb.0: ; %bb
-; GFX90ADAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX90ADAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX90ADAG-NEXT: s_mov_b32 s38, -1
-; GFX90ADAG-NEXT: s_mov_b32 s39, 0xe00000
-; GFX90ADAG-NEXT: s_add_u32 s36, s36, s11
-; GFX90ADAG-NEXT: s_addc_u32 s37, s37, 0
-; GFX90ADAG-NEXT: s_mov_b32 s12, s8
-; GFX90ADAG-NEXT: s_add_u32 s8, s4, 44
-; GFX90ADAG-NEXT: s_mov_b32 s13, s9
-; GFX90ADAG-NEXT: s_addc_u32 s9, s5, 0
-; GFX90ADAG-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
-; GFX90ADAG-NEXT: s_getpc_b64 s[4:5]
-; GFX90ADAG-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4
-; GFX90ADAG-NEXT: s_addc_u32 s5, s5, foo@gotpcrel32@hi+12
-; GFX90ADAG-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX90ADAG-NEXT: s_mov_b32 s14, s10
-; GFX90ADAG-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX90ADAG-NEXT: s_mov_b64 s[4:5], s[0:1]
-; GFX90ADAG-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX90ADAG-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX90ADAG-NEXT: v_mov_b32_e32 v31, v0
-; GFX90ADAG-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX90ADAG-NEXT: s_mov_b32 s32, 0
-; GFX90ADAG-NEXT: v_mov_b32_e32 v40, 0
-; GFX90ADAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90ADAG-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX90ADAG-NEXT: global_load_dwordx4 a[28:31], v40, s[34:35] offset:112
-; GFX90ADAG-NEXT: global_load_dwordx4 a[24:27], v40, s[34:35] offset:96
-; GFX90ADAG-NEXT: global_load_dwordx4 a[20:23], v40, s[34:35] offset:80
-; GFX90ADAG-NEXT: global_load_dwordx4 a[16:19], v40, s[34:35] offset:64
-; GFX90ADAG-NEXT: global_load_dwordx4 a[12:15], v40, s[34:35] offset:48
-; GFX90ADAG-NEXT: global_load_dwordx4 a[8:11], v40, s[34:35] offset:32
-; GFX90ADAG-NEXT: global_load_dwordx4 a[4:7], v40, s[34:35] offset:16
-; GFX90ADAG-NEXT: global_load_dwordx4 a[0:3], v40, s[34:35]
-; GFX90ADAG-NEXT: v_mov_b32_e32 v0, 1.0
-; GFX90ADAG-NEXT: v_mov_b32_e32 v1, 2.0
-; GFX90ADAG-NEXT: s_waitcnt vmcnt(0)
-; GFX90ADAG-NEXT: s_nop 0
-; GFX90ADAG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
-; GFX90ADAG-NEXT: s_nop 15
-; GFX90ADAG-NEXT: s_nop 2
-; GFX90ADAG-NEXT: global_store_dwordx4 v40, a[24:27], s[34:35] offset:96
-; GFX90ADAG-NEXT: global_store_dwordx4 v40, a[28:31], s[34:35] offset:112
-; GFX90ADAG-NEXT: global_store_dwordx4 v40, a[16:19], s[34:35] offset:64
-; GFX90ADAG-NEXT: global_store_dwordx4 v40, a[20:23], s[34:35] offset:80
-; GFX90ADAG-NEXT: global_store_dwordx4 v40, a[8:11], s[34:35] offset:32
-; GFX90ADAG-NEXT: global_store_dwordx4 v40, a[12:15], s[34:35] offset:48
-; GFX90ADAG-NEXT: global_store_dwordx4 v40, a[0:3], s[34:35]
-; GFX90ADAG-NEXT: global_store_dwordx4 v40, a[4:7], s[34:35] offset:16
-; GFX90ADAG-NEXT: s_endpgm
-;
-; GFX90AGSEL-LABEL: test_mfma_f32_32x32x1f32_call:
-; GFX90AGSEL: ; %bb.0: ; %bb
-; GFX90AGSEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX90AGSEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX90AGSEL-NEXT: s_mov_b32 s38, -1
-; GFX90AGSEL-NEXT: s_mov_b32 s39, 0xe00000
-; GFX90AGSEL-NEXT: s_add_u32 s36, s36, s11
-; GFX90AGSEL-NEXT: s_addc_u32 s37, s37, 0
-; GFX90AGSEL-NEXT: s_mov_b32 s16, s8
-; GFX90AGSEL-NEXT: s_add_u32 s8, s4, 44
-; GFX90AGSEL-NEXT: s_mov_b32 s15, s9
-; GFX90AGSEL-NEXT: s_addc_u32 s9, s5, 0
-; GFX90AGSEL-NEXT: s_mov_b64 s[12:13], s[0:1]
-; GFX90AGSEL-NEXT: s_getpc_b64 s[0:1]
-; GFX90AGSEL-NEXT: s_add_u32 s0, s0, foo@gotpcrel32@lo+4
-; GFX90AGSEL-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12
-; GFX90AGSEL-NEXT: s_load_dwordx2 s[18:19], s[0:1], 0x0
-; GFX90AGSEL-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
-; GFX90AGSEL-NEXT: s_mov_b32 s14, s10
-; GFX90AGSEL-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX90AGSEL-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX90AGSEL-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX90AGSEL-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX90AGSEL-NEXT: s_mov_b64 s[4:5], s[12:13]
-; GFX90AGSEL-NEXT: s_mov_b32 s12, s16
-; GFX90AGSEL-NEXT: s_mov_b32 s13, s15
-; GFX90AGSEL-NEXT: v_mov_b32_e32 v31, v0
-; GFX90AGSEL-NEXT: s_mov_b32 s32, 0
-; GFX90AGSEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90AGSEL-NEXT: s_swappc_b64 s[30:31], s[18:19]
-; GFX90AGSEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX90AGSEL-NEXT: global_load_dwordx4 a[0:3], v0, s[34:35]
-; GFX90AGSEL-NEXT: global_load_dwordx4 a[4:7], v0, s[34:35] offset:16
-; GFX90AGSEL-NEXT: global_load_dwordx4 a[8:11], v0, s[34:35] offset:32
-; GFX90AGSEL-NEXT: global_load_dwordx4 a[12:15], v0, s[34:35] offset:48
-; GFX90AGSEL-NEXT: global_load_dwordx4 a[16:19], v0, s[34:35] offset:64
-; GFX90AGSEL-NEXT: global_load_dwordx4 a[20:23], v0, s[34:35] offset:80
-; GFX90AGSEL-NEXT: global_load_dwordx4 a[24:27], v0, s[34:35] offset:96
-; GFX90AGSEL-NEXT: global_load_dwordx4 a[28:31], v0, s[34:35] offset:112
-; GFX90AGSEL-NEXT: v_mov_b32_e32 v1, 1.0
-; GFX90AGSEL-NEXT: v_mov_b32_e32 v2, 2.0
-; GFX90AGSEL-NEXT: s_waitcnt vmcnt(0)
-; GFX90AGSEL-NEXT: s_nop 0
-; GFX90AGSEL-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
-; GFX90AGSEL-NEXT: s_nop 15
-; GFX90AGSEL-NEXT: s_nop 2
-; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35]
-; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16
-; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[8:11], s[34:35] offset:32
-; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[12:15], s[34:35] offset:48
-; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[16:19], s[34:35] offset:64
-; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[20:23], s[34:35] offset:80
-; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96
-; GFX90AGSEL-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112
-; GFX90AGSEL-NEXT: s_endpgm
-;
-; GFX942DAG-LABEL: test_mfma_f32_32x32x1f32_call:
-; GFX942DAG: ; %bb.0: ; %bb
-; GFX942DAG-NEXT: s_mov_b32 s12, s8
-; GFX942DAG-NEXT: s_add_u32 s8, s4, 44
-; GFX942DAG-NEXT: s_mov_b32 s13, s9
-; GFX942DAG-NEXT: s_addc_u32 s9, s5, 0
-; GFX942DAG-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
-; GFX942DAG-NEXT: s_getpc_b64 s[4:5]
-; GFX942DAG-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4
-; GFX942DAG-NEXT: s_addc_u32 s5, s5, foo@gotpcrel32@hi+12
-; GFX942DAG-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX942DAG-NEXT: s_mov_b32 s14, s10
-; GFX942DAG-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX942DAG-NEXT: s_mov_b64 s[4:5], s[0:1]
-; GFX942DAG-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX942DAG-NEXT: v_mov_b32_e32 v31, v0
-; GFX942DAG-NEXT: s_mov_b32 s32, 0
-; GFX942DAG-NEXT: v_mov_b32_e32 v40, 0
-; GFX942DAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942DAG-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX942DAG-NEXT: global_load_dwordx4 a[28:31], v40, s[34:35] offset:112
-; GFX942DAG-NEXT: global_load_dwordx4 a[24:27], v40, s[34:35] offset:96
-; GFX942DAG-NEXT: global_load_dwordx4 a[20:23], v40, s[34:35] offset:80
-; GFX942DAG-NEXT: global_load_dwordx4 a[16:19], v40, s[34:35] offset:64
-; GFX942DAG-NEXT: global_load_dwordx4 a[12:15], v40, s[34:35] offset:48
-; GFX942DAG-NEXT: global_load_dwordx4 a[8:11], v40, s[34:35] offset:32
-; GFX942DAG-NEXT: global_load_dwordx4 a[4:7], v40, s[34:35] offset:16
-; GFX942DAG-NEXT: global_load_dwordx4 a[0:3], v40, s[34:35]
-; GFX942DAG-NEXT: v_mov_b32_e32 v0, 1.0
-; GFX942DAG-NEXT: v_mov_b32_e32 v1, 2.0
-; GFX942DAG-NEXT: s_waitcnt vmcnt(0)
-; GFX942DAG-NEXT: s_nop 0
-; GFX942DAG-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]
-; GFX942DAG-NEXT: s_nop 15
-; GFX942DAG-NEXT: s_nop 1
-; GFX942DAG-NEXT: global_store_dwordx4 v40, a[24:27], s[34:35] offset:96
-; GFX942DAG-NEXT: global_store_dwordx4 v40, a[28:31], s[34:35] offset:112
-; GFX942DAG-NEXT: global_store_dwordx4 v40, a[16:19], s[34:35] offset:64
-; GFX942DAG-NEXT: global_store_dwordx4 v40, a[20:23], s[34:35] offset:80
-; GFX942DAG-NEXT: global_store_dwordx4 v40, a[8:11], s[34:35] offset:32
-; GFX942DAG-NEXT: global_store_dwordx4 v40, a[12:15], s[34:35] offset:48
-; GFX942DAG-NEXT: global_store_dwordx4 v40, a[0:3], s[34:35]
-; GFX942DAG-NEXT: global_store_dwordx4 v40, a[4:7], s[34:35] offset:16
-; GFX942DAG-NEXT: s_endpgm
-;
-; GFX942GSEL-LABEL: test_mfma_f32_32x32x1f32_call:
-; GFX942GSEL: ; %bb.0: ; %bb
-; GFX942GSEL-NEXT: s_mov_b32 s12, s8
-; GFX942GSEL-NEXT: s_add_u32 s8, s4, 44
-; GFX942GSEL-NEXT: s_mov_b32 s13, s9
-; GFX942GSEL-NEXT: s_addc_u32 s9, s5, 0
-; GFX942GSEL-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
-; GFX942GSEL-NEXT: s_getpc_b64 s[4:5]
-; GFX942GSEL-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4
-; GFX942GSEL-NEXT: s_addc_u32 s5, s5, foo@gotpcrel32@hi+12
-; GFX942GSEL-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX942GSEL-NEXT: s_mov_b32 s14, s10
-; GFX942GSEL-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX942GSEL-NEXT: s_mov_b64 s[4:5], s[0:1]
-; GFX942GSEL-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX942GSEL-NEXT: v_mov_b32_e32 v31, v0
-; GFX942GSEL-NEXT: s_mov_b32 s32, 0
-; GFX942GSEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942GSEL-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX942GSEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX942GSEL-NEXT: global_load_dwordx4 a[0:3], v0, s[34:35]
-; GFX942GSEL-NEXT: global_load_dwordx4 a[4:7], v0, s[34:35] offset:16
-; GFX942GSEL-NEXT: global_load_dwordx4 a[8:11], v0, s[34:35] offset:32
-; GFX942GSEL-NEXT: global_load_dwordx4 a[12:15], v0, s[34:35] offset:48
-; GFX942GSEL-NEXT: global_load_dwordx4 a[16:19], v0, s[34:35] offset:64
-; GFX942GSEL-NEXT: global_load_dwordx4 a[20:23], v0, s[34:35] offset:80
-; GFX942GSEL-NEXT: global_load_dwordx4 a[24:27], v0, s[34:35] offset:96
-; GFX942GSEL-NEXT: global_load_dwordx4 a[28:31], v0, s[34:35] offset:112
-; GFX942GSEL-NEXT: v_mov_b32_e32 v1, 1.0
-; GFX942GSEL-NEXT: v_mov_b32_e32 v2, 2.0
-; GFX942GSEL-NEXT: s_waitcnt vmcnt(0)
-; GFX942GSEL-NEXT: s_nop 0
-; GFX942GSEL-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v2, a[0:31]
-; GFX942GSEL-NEXT: s_nop 15
-; GFX942GSEL-NEXT: s_nop 1
-; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35]
-; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16
-; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[8:11], s[34:35] offset:32
-; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[12:15], s[34:35] offset:48
-; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[16:19], s[34:35] offset:64
-; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[20:23], s[34:35] offset:80
-; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96
-; GFX942GSEL-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112
-; GFX942GSEL-NEXT: s_endpgm
bb:
call void @foo()
%in.1 = load <32 x float>, ptr addrspace(1) %arg
@@ -1830,59 +831,59 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_call_multi_bb(ptr addrspace(
; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v6, v3, a[0:31] cbsz:1 abid:2 blgp:3
; GFX908-NEXT: s_nop 15
; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a24
-; GFX908-NEXT: v_accvgpr_read_b32 v4, a25
-; GFX908-NEXT: v_accvgpr_read_b32 v5, a26
; GFX908-NEXT: v_accvgpr_read_b32 v6, a27
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a24
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:96
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a28
-; GFX908-NEXT: v_accvgpr_read_b32 v4, a29
-; GFX908-NEXT: v_accvgpr_read_b32 v5, a30
; GFX908-NEXT: v_accvgpr_read_b32 v6, a31
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a28
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:112
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a16
-; GFX908-NEXT: v_accvgpr_read_b32 v4, a17
-; GFX908-NEXT: v_accvgpr_read_b32 v5, a18
; GFX908-NEXT: v_accvgpr_read_b32 v6, a19
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a16
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:64
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a20
-; GFX908-NEXT: v_accvgpr_read_b32 v4, a21
-; GFX908-NEXT: v_accvgpr_read_b32 v5, a22
; GFX908-NEXT: v_accvgpr_read_b32 v6, a23
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a20
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:80
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a8
-; GFX908-NEXT: v_accvgpr_read_b32 v4, a9
-; GFX908-NEXT: v_accvgpr_read_b32 v5, a10
; GFX908-NEXT: v_accvgpr_read_b32 v6, a11
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a8
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:32
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a12
-; GFX908-NEXT: v_accvgpr_read_b32 v4, a13
-; GFX908-NEXT: v_accvgpr_read_b32 v5, a14
; GFX908-NEXT: v_accvgpr_read_b32 v6, a15
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a12
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:48
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
-; GFX908-NEXT: v_accvgpr_read_b32 v4, a1
-; GFX908-NEXT: v_accvgpr_read_b32 v5, a2
; GFX908-NEXT: v_accvgpr_read_b32 v6, a3
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7]
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a4
-; GFX908-NEXT: v_accvgpr_read_b32 v4, a5
-; GFX908-NEXT: v_accvgpr_read_b32 v5, a6
; GFX908-NEXT: v_accvgpr_read_b32 v6, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a4
; GFX908-NEXT: s_nop 1
; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:16
; GFX908-NEXT: s_cbranch_scc1 .LBB6_2
@@ -1905,331 +906,6 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_call_multi_bb(ptr addrspace(
; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX908-NEXT: .LBB6_2: ; %bb3
; GFX908-NEXT: s_endpgm
-;
-; GFX90ADAG-LABEL: test_mfma_f32_32x32x1f32_call_multi_bb:
-; GFX90ADAG: ; %bb.0: ; %bb1
-; GFX90ADAG-NEXT: s_mov_b32 s52, SCRATCH_RSRC_DWORD0
-; GFX90ADAG-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1
-; GFX90ADAG-NEXT: s_mov_b32 s54, -1
-; GFX90ADAG-NEXT: s_mov_b32 s55, 0xe00000
-; GFX90ADAG-NEXT: s_add_u32 s52, s52, s11
-; GFX90ADAG-NEXT: s_mov_b32 s14, s10
-; GFX90ADAG-NEXT: s_mov_b32 s12, s8
-; GFX90ADAG-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX90ADAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX90ADAG-NEXT: s_load_dword s8, s[4:5], 0x2c
-; GFX90ADAG-NEXT: v_mov_b32_e32 v2, 1.0
-; GFX90ADAG-NEXT: v_mov_b32_e32 v3, 2.0
-; GFX90ADAG-NEXT: s_addc_u32 s53, s53, 0
-; GFX90ADAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90ADAG-NEXT: s_load_dwordx16 s[36:51], s[6:7], 0x0
-; GFX90ADAG-NEXT: s_load_dwordx16 s[16:31], s[6:7], 0x40
-; GFX90ADAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX90ADAG-NEXT: s_bitcmp0_b32 s8, 0
-; GFX90ADAG-NEXT: s_mov_b32 s32, 0
-; GFX90ADAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90ADAG-NEXT: v_accvgpr_write_b32 a0, s36
-; GFX90ADAG-NEXT: v_accvgpr_write_b32 a1, s37
-; GFX90ADAG-NEXT: v_accvgpr_write_b32 a2, s38
-; GFX90ADAG-NEXT: v_accvgpr_write_b32 a3, s39
-; GFX90ADAG-NEXT: v_accvgpr_write_b32 a4, s40
-; GFX90ADAG-NEXT: v_accvgpr_write_b32 a5, s41
-; GFX90ADAG-NEXT: v_accvgpr_write_b32 a6, s42
-; GFX90ADAG-NEXT: v_accvgpr_write_b32 a7, s43
-; GFX90ADAG-NEXT: v_accvgpr_write_b32 a8, s44
-; GFX90ADAG-NEXT: v_accvgpr_write_b32 a9, s45
-; GFX90ADAG-NEXT: v_accvgpr_write_b32 a10, s46
-; GFX90ADAG-NEXT: v_accvgpr_write_b32 a11, s47
-; GFX90ADAG-NEXT: v_accvgpr_write_b32 a12, s48
-; GFX90ADAG-NEXT: v_accvgpr_write_b32 a13, s49
-; GFX90ADAG-NEXT: v_accvgpr_write_b32 a14, s50
-; GFX90ADAG-NEXT: v_accvgpr_write_b32 a15, s51
-; GFX90ADAG-NEXT: v_accvgpr_write_b32 a16, s16
-; GFX90ADAG-NEXT: v_accvgpr_write_b32 a17, s17
-; GFX90ADAG-NEXT: v_accvgpr_write_b32 a18, s18
-; GFX90ADAG-NEXT: v_accvgpr_write_b32 a19, s19
-; GFX90ADAG-NEXT: v_accvgpr_write_b32 a20, s20
-; GFX90ADAG-NEXT: v_accvgpr_write_b32 a21, s21
-; GFX90ADAG-NEXT: v_accvgpr_write_b32 a22, s22
-; GFX90ADAG-NEXT: v_accvgpr_write_b32 a23, s23
-; GFX90ADAG-NEXT: v_accvgpr_write_b32 a24, s24
-; GFX90ADAG-NEXT: v_accvgpr_write_b32 a25, s25
-; GFX90ADAG-NEXT: v_accvgpr_write_b32 a26, s26
-; GFX90ADAG-NEXT: v_accvgpr_write_b32 a27, s27
-; GFX90ADAG-NEXT: v_accvgpr_write_b32 a28, s28
-; GFX90ADAG-NEXT: v_accvgpr_write_b32 a29, s29
-; GFX90ADAG-NEXT: v_accvgpr_write_b32 a30, s30
-; GFX90ADAG-NEXT: v_accvgpr_write_b32 a31, s31
-; GFX90ADAG-NEXT: s_nop 1
-; GFX90ADAG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] cbsz:1 abid:2 blgp:3
-; GFX90ADAG-NEXT: s_nop 15
-; GFX90ADAG-NEXT: s_nop 2
-; GFX90ADAG-NEXT: global_store_dwordx4 v1, a[24:27], s[6:7] offset:96
-; GFX90ADAG-NEXT: global_store_dwordx4 v1, a[28:31], s[6:7] offset:112
-; GFX90ADAG-NEXT: global_store_dwordx4 v1, a[16:19], s[6:7] offset:64
-; GFX90ADAG-NEXT: global_store_dwordx4 v1, a[20:23], s[6:7] offset:80
-; GFX90ADAG-NEXT: global_store_dwordx4 v1, a[8:11], s[6:7] offset:32
-; GFX90ADAG-NEXT: global_store_dwordx4 v1, a[12:15], s[6:7] offset:48
-; GFX90ADAG-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7]
-; GFX90ADAG-NEXT: global_store_dwordx4 v1, a[4:7], s[6:7] offset:16
-; GFX90ADAG-NEXT: s_cbranch_scc1 .LBB6_2
-; GFX90ADAG-NEXT: ; %bb.1: ; %bb2
-; GFX90ADAG-NEXT: s_add_u32 s8, s4, 48
-; GFX90ADAG-NEXT: s_mov_b32 s13, s9
-; GFX90ADAG-NEXT: s_addc_u32 s9, s5, 0
-; GFX90ADAG-NEXT: s_getpc_b64 s[4:5]
-; GFX90ADAG-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4
-; GFX90ADAG-NEXT: s_addc_u32 s5, s5, foo@gotpcrel32@hi+12
-; GFX90ADAG-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX90ADAG-NEXT: s_mov_b64 s[4:5], s[0:1]
-; GFX90ADAG-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX90ADAG-NEXT: s_mov_b64 s[0:1], s[52:53]
-; GFX90ADAG-NEXT: v_mov_b32_e32 v31, v0
-; GFX90ADAG-NEXT: s_mov_b64 s[2:3], s[54:55]
-; GFX90ADAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90ADAG-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX90ADAG-NEXT: .LBB6_2: ; %bb3
-; GFX90ADAG-NEXT: s_endpgm
-;
-; GFX90AGSEL-LABEL: test_mfma_f32_32x32x1f32_call_multi_bb:
-; GFX90AGSEL: ; %bb.0: ; %bb1
-; GFX90AGSEL-NEXT: s_mov_b32 s68, SCRATCH_RSRC_DWORD0
-; GFX90AGSEL-NEXT: s_mov_b32 s69, SCRATCH_RSRC_DWORD1
-; GFX90AGSEL-NEXT: s_mov_b32 s70, -1
-; GFX90AGSEL-NEXT: s_mov_b32 s71, 0xe00000
-; GFX90AGSEL-NEXT: s_add_u32 s68, s68, s11
-; GFX90AGSEL-NEXT: s_mov_b32 s14, s10
-; GFX90AGSEL-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX90AGSEL-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX90AGSEL-NEXT: s_mov_b64 s[16:17], s[0:1]
-; GFX90AGSEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX90AGSEL-NEXT: s_load_dword s2, s[4:5], 0x2c
-; GFX90AGSEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90AGSEL-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0x0
-; GFX90AGSEL-NEXT: s_load_dwordx16 s[52:67], s[0:1], 0x40
-; GFX90AGSEL-NEXT: v_mov_b32_e32 v1, 1.0
-; GFX90AGSEL-NEXT: v_mov_b32_e32 v2, 2.0
-; GFX90AGSEL-NEXT: s_addc_u32 s69, s69, 0
-; GFX90AGSEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a0, s36
-; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a16, s52
-; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a1, s37
-; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a2, s38
-; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a3, s39
-; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a4, s40
-; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a5, s41
-; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a6, s42
-; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a7, s43
-; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a8, s44
-; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a9, s45
-; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a10, s46
-; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a11, s47
-; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a12, s48
-; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a13, s49
-; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a14, s50
-; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a15, s51
-; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a17, s53
-; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a18, s54
-; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a19, s55
-; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a20, s56
-; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a21, s57
-; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a22, s58
-; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a23, s59
-; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a24, s60
-; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a25, s61
-; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a26, s62
-; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a27, s63
-; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a28, s64
-; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a29, s65
-; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a30, s66
-; GFX90AGSEL-NEXT: v_accvgpr_write_b32 a31, s67
-; GFX90AGSEL-NEXT: s_xor_b32 s2, s2, 1
-; GFX90AGSEL-NEXT: s_and_b32 s2, s2, 1
-; GFX90AGSEL-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3
-; GFX90AGSEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX90AGSEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX90AGSEL-NEXT: s_mov_b32 s32, 0
-; GFX90AGSEL-NEXT: s_nop 15
-; GFX90AGSEL-NEXT: global_store_dwordx4 v1, a[0:3], s[0:1]
-; GFX90AGSEL-NEXT: global_store_dwordx4 v1, a[4:7], s[0:1] offset:16
-; GFX90AGSEL-NEXT: global_store_dwordx4 v1, a[8:11], s[0:1] offset:32
-; GFX90AGSEL-NEXT: global_store_dwordx4 v1, a[12:15], s[0:1] offset:48
-; GFX90AGSEL-NEXT: global_store_dwordx4 v1, a[16:19], s[0:1] offset:64
-; GFX90AGSEL-NEXT: global_store_dwordx4 v1, a[20:23], s[0:1] offset:80
-; GFX90AGSEL-NEXT: global_store_dwordx4 v1, a[24:27], s[0:1] offset:96
-; GFX90AGSEL-NEXT: global_store_dwordx4 v1, a[28:31], s[0:1] offset:112
-; GFX90AGSEL-NEXT: s_cbranch_scc1 .LBB6_2
-; GFX90AGSEL-NEXT: ; %bb.1: ; %bb2
-; GFX90AGSEL-NEXT: s_getpc_b64 s[0:1]
-; GFX90AGSEL-NEXT: s_add_u32 s0, s0, foo@gotpcrel32@lo+4
-; GFX90AGSEL-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12
-; GFX90AGSEL-NEXT: s_load_dwordx2 s[18:19], s[0:1], 0x0
-; GFX90AGSEL-NEXT: s_mov_b32 s12, s8
-; GFX90AGSEL-NEXT: s_add_u32 s8, s4, 48
-; GFX90AGSEL-NEXT: s_mov_b64 s[0:1], s[68:69]
-; GFX90AGSEL-NEXT: s_mov_b32 s13, s9
-; GFX90AGSEL-NEXT: s_addc_u32 s9, s5, 0
-; GFX90AGSEL-NEXT: s_mov_b64 s[2:3], s[70:71]
-; GFX90AGSEL-NEXT: s_mov_b64 s[4:5], s[16:17]
-; GFX90AGSEL-NEXT: v_mov_b32_e32 v31, v0
-; GFX90AGSEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90AGSEL-NEXT: s_swappc_b64 s[30:31], s[18:19]
-; GFX90AGSEL-NEXT: .LBB6_2: ; %bb3
-; GFX90AGSEL-NEXT: s_endpgm
-;
-; GFX942DAG-LABEL: test_mfma_f32_32x32x1f32_call_multi_bb:
-; GFX942DAG: ; %bb.0: ; %bb1
-; GFX942DAG-NEXT: s_mov_b32 s14, s10
-; GFX942DAG-NEXT: s_mov_b32 s12, s8
-; GFX942DAG-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX942DAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942DAG-NEXT: s_load_dword s8, s[4:5], 0x2c
-; GFX942DAG-NEXT: v_mov_b32_e32 v2, 1.0
-; GFX942DAG-NEXT: v_mov_b32_e32 v3, 2.0
-; GFX942DAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX942DAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942DAG-NEXT: s_load_dwordx16 s[36:51], s[6:7], 0x0
-; GFX942DAG-NEXT: s_load_dwordx16 s[16:31], s[6:7], 0x40
-; GFX942DAG-NEXT: s_bitcmp0_b32 s8, 0
-; GFX942DAG-NEXT: s_mov_b32 s32, 0
-; GFX942DAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942DAG-NEXT: v_accvgpr_write_b32 a0, s36
-; GFX942DAG-NEXT: v_accvgpr_write_b32 a1, s37
-; GFX942DAG-NEXT: v_accvgpr_write_b32 a2, s38
-; GFX942DAG-NEXT: v_accvgpr_write_b32 a3, s39
-; GFX942DAG-NEXT: v_accvgpr_write_b32 a4, s40
-; GFX942DAG-NEXT: v_accvgpr_write_b32 a5, s41
-; GFX942DAG-NEXT: v_accvgpr_write_b32 a6, s42
-; GFX942DAG-NEXT: v_accvgpr_write_b32 a7, s43
-; GFX942DAG-NEXT: v_accvgpr_write_b32 a8, s44
-; GFX942DAG-NEXT: v_accvgpr_write_b32 a9, s45
-; GFX942DAG-NEXT: v_accvgpr_write_b32 a10, s46
-; GFX942DAG-NEXT: v_accvgpr_write_b32 a11, s47
-; GFX942DAG-NEXT: v_accvgpr_write_b32 a12, s48
-; GFX942DAG-NEXT: v_accvgpr_write_b32 a13, s49
-; GFX942DAG-NEXT: v_accvgpr_write_b32 a14, s50
-; GFX942DAG-NEXT: v_accvgpr_write_b32 a15, s51
-; GFX942DAG-NEXT: v_accvgpr_write_b32 a16, s16
-; GFX942DAG-NEXT: v_accvgpr_write_b32 a17, s17
-; GFX942DAG-NEXT: v_accvgpr_write_b32 a18, s18
-; GFX942DAG-NEXT: v_accvgpr_write_b32 a19, s19
-; GFX942DAG-NEXT: v_accvgpr_write_b32 a20, s20
-; GFX942DAG-NEXT: v_accvgpr_write_b32 a21, s21
-; GFX942DAG-NEXT: v_accvgpr_write_b32 a22, s22
-; GFX942DAG-NEXT: v_accvgpr_write_b32 a23, s23
-; GFX942DAG-NEXT: v_accvgpr_write_b32 a24, s24
-; GFX942DAG-NEXT: v_accvgpr_write_b32 a25, s25
-; GFX942DAG-NEXT: v_accvgpr_write_b32 a26, s26
-; GFX942DAG-NEXT: v_accvgpr_write_b32 a27, s27
-; GFX942DAG-NEXT: v_accvgpr_write_b32 a28, s28
-; GFX942DAG-NEXT: v_accvgpr_write_b32 a29, s29
-; GFX942DAG-NEXT: v_accvgpr_write_b32 a30, s30
-; GFX942DAG-NEXT: v_accvgpr_write_b32 a31, s31
-; GFX942DAG-NEXT: s_nop 1
-; GFX942DAG-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v2, v3, a[0:31] cbsz:1 abid:2 blgp:3
-; GFX942DAG-NEXT: s_nop 15
-; GFX942DAG-NEXT: s_nop 1
-; GFX942DAG-NEXT: global_store_dwordx4 v1, a[24:27], s[6:7] offset:96
-; GFX942DAG-NEXT: global_store_dwordx4 v1, a[28:31], s[6:7] offset:112
-; GFX942DAG-NEXT: global_store_dwordx4 v1, a[16:19], s[6:7] offset:64
-; GFX942DAG-NEXT: global_store_dwordx4 v1, a[20:23], s[6:7] offset:80
-; GFX942DAG-NEXT: global_store_dwordx4 v1, a[8:11], s[6:7] offset:32
-; GFX942DAG-NEXT: global_store_dwordx4 v1, a[12:15], s[6:7] offset:48
-; GFX942DAG-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7]
-; GFX942DAG-NEXT: global_store_dwordx4 v1, a[4:7], s[6:7] offset:16
-; GFX942DAG-NEXT: s_cbranch_scc1 .LBB6_2
-; GFX942DAG-NEXT: ; %bb.1: ; %bb2
-; GFX942DAG-NEXT: s_add_u32 s8, s4, 48
-; GFX942DAG-NEXT: s_mov_b32 s13, s9
-; GFX942DAG-NEXT: s_addc_u32 s9, s5, 0
-; GFX942DAG-NEXT: s_getpc_b64 s[4:5]
-; GFX942DAG-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4
-; GFX942DAG-NEXT: s_addc_u32 s5, s5, foo@gotpcrel32@hi+12
-; GFX942DAG-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX942DAG-NEXT: s_mov_b64 s[4:5], s[0:1]
-; GFX942DAG-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX942DAG-NEXT: v_mov_b32_e32 v31, v0
-; GFX942DAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942DAG-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX942DAG-NEXT: .LBB6_2: ; %bb3
-; GFX942DAG-NEXT: s_endpgm
-;
-; GFX942GSEL-LABEL: test_mfma_f32_32x32x1f32_call_multi_bb:
-; GFX942GSEL: ; %bb.0: ; %bb1
-; GFX942GSEL-NEXT: s_mov_b32 s14, s10
-; GFX942GSEL-NEXT: s_mov_b32 s12, s8
-; GFX942GSEL-NEXT: s_mov_b64 s[10:11], s[6:7]
-; GFX942GSEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942GSEL-NEXT: s_load_dword s8, s[4:5], 0x2c
-; GFX942GSEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942GSEL-NEXT: s_load_dwordx16 s[16:31], s[6:7], 0x0
-; GFX942GSEL-NEXT: s_load_dwordx16 s[36:51], s[6:7], 0x40
-; GFX942GSEL-NEXT: v_mov_b32_e32 v1, 1.0
-; GFX942GSEL-NEXT: v_mov_b32_e32 v2, 2.0
-; GFX942GSEL-NEXT: s_xor_b32 s8, s8, 1
-; GFX942GSEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942GSEL-NEXT: v_accvgpr_write_b32 a0, s16
-; GFX942GSEL-NEXT: v_accvgpr_write_b32 a16, s36
-; GFX942GSEL-NEXT: v_accvgpr_write_b32 a1, s17
-; GFX942GSEL-NEXT: v_accvgpr_write_b32 a2, s18
-; GFX942GSEL-NEXT: v_accvgpr_write_b32 a3, s19
-; GFX942GSEL-NEXT: v_accvgpr_write_b32 a4, s20
-; GFX942GSEL-NEXT: v_accvgpr_write_b32 a5, s21
-; GFX942GSEL-NEXT: v_accvgpr_write_b32 a6, s22
-; GFX942GSEL-NEXT: v_accvgpr_write_b32 a7, s23
-; GFX942GSEL-NEXT: v_accvgpr_write_b32 a8, s24
-; GFX942GSEL-NEXT: v_accvgpr_write_b32 a9, s25
-; GFX942GSEL-NEXT: v_accvgpr_write_b32 a10, s26
-; GFX942GSEL-NEXT: v_accvgpr_write_b32 a11, s27
-; GFX942GSEL-NEXT: v_accvgpr_write_b32 a12, s28
-; GFX942GSEL-NEXT: v_accvgpr_write_b32 a13, s29
-; GFX942GSEL-NEXT: v_accvgpr_write_b32 a14, s30
-; GFX942GSEL-NEXT: v_accvgpr_write_b32 a15, s31
-; GFX942GSEL-NEXT: v_accvgpr_write_b32 a17, s37
-; GFX942GSEL-NEXT: v_accvgpr_write_b32 a18, s38
-; GFX942GSEL-NEXT: v_accvgpr_write_b32 a19, s39
-; GFX942GSEL-NEXT: v_accvgpr_write_b32 a20, s40
-; GFX942GSEL-NEXT: v_accvgpr_write_b32 a21, s41
-; GFX942GSEL-NEXT: v_accvgpr_write_b32 a22, s42
-; GFX942GSEL-NEXT: v_accvgpr_write_b32 a23, s43
-; GFX942GSEL-NEXT: v_accvgpr_write_b32 a24, s44
-; GFX942GSEL-NEXT: v_accvgpr_write_b32 a25, s45
-; GFX942GSEL-NEXT: v_accvgpr_write_b32 a26, s46
-; GFX942GSEL-NEXT: v_accvgpr_write_b32 a27, s47
-; GFX942GSEL-NEXT: v_accvgpr_write_b32 a28, s48
-; GFX942GSEL-NEXT: v_accvgpr_write_b32 a29, s49
-; GFX942GSEL-NEXT: v_accvgpr_write_b32 a30, s50
-; GFX942GSEL-NEXT: v_accvgpr_write_b32 a31, s51
-; GFX942GSEL-NEXT: s_and_b32 s8, s8, 1
-; GFX942GSEL-NEXT: s_cmp_lg_u32 s8, 0
-; GFX942GSEL-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3
-; GFX942GSEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX942GSEL-NEXT: s_mov_b32 s32, 0
-; GFX942GSEL-NEXT: s_nop 15
-; GFX942GSEL-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7]
-; GFX942GSEL-NEXT: global_store_dwordx4 v1, a[4:7], s[6:7] offset:16
-; GFX942GSEL-NEXT: global_store_dwordx4 v1, a[8:11], s[6:7] offset:32
-; GFX942GSEL-NEXT: global_store_dwordx4 v1, a[12:15], s[6:7] offset:48
-; GFX942GSEL-NEXT: global_store_dwordx4 v1, a[16:19], s[6:7] offset:64
-; GFX942GSEL-NEXT: global_store_dwordx4 v1, a[20:23], s[6:7] offset:80
-; GFX942GSEL-NEXT: global_store_dwordx4 v1, a[24:27], s[6:7] offset:96
-; GFX942GSEL-NEXT: global_store_dwordx4 v1, a[28:31], s[6:7] offset:112
-; GFX942GSEL-NEXT: s_cbranch_scc1 .LBB6_2
-; GFX942GSEL-NEXT: ; %bb.1: ; %bb2
-; GFX942GSEL-NEXT: s_getpc_b64 s[6:7]
-; GFX942GSEL-NEXT: s_add_u32 s6, s6, foo@gotpcrel32@lo+4
-; GFX942GSEL-NEXT: s_addc_u32 s7, s7, foo@gotpcrel32@hi+12
-; GFX942GSEL-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GFX942GSEL-NEXT: s_add_u32 s8, s4, 48
-; GFX942GSEL-NEXT: s_mov_b32 s13, s9
-; GFX942GSEL-NEXT: s_addc_u32 s9, s5, 0
-; GFX942GSEL-NEXT: s_mov_b64 s[4:5], s[0:1]
-; GFX942GSEL-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX942GSEL-NEXT: v_mov_b32_e32 v31, v0
-; GFX942GSEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942GSEL-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX942GSEL-NEXT: .LBB6_2: ; %bb3
-; GFX942GSEL-NEXT: s_endpgm
bb1:
%in.1 = load <32 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 1, i32 2, i32 3)
@@ -2296,40 +972,40 @@ define void @test_mfma_f32_32x32x1f32_nonentry_noagpr(ptr addrspace(1) %arg) #0
; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
; GFX908-NEXT: s_nop 15
; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a24
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a25
-; GFX908-NEXT: v_accvgpr_read_b32 v4, a26
; GFX908-NEXT: v_accvgpr_read_b32 v5, a27
-; GFX908-NEXT: v_accvgpr_read_b32 v6, a28
-; GFX908-NEXT: v_accvgpr_read_b32 v7, a29
-; GFX908-NEXT: v_accvgpr_read_b32 v8, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a24
; GFX908-NEXT: v_accvgpr_read_b32 v9, a31
-; GFX908-NEXT: v_accvgpr_read_b32 v10, a16
-; GFX908-NEXT: v_accvgpr_read_b32 v11, a17
-; GFX908-NEXT: v_accvgpr_read_b32 v12, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v8, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v7, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a28
; GFX908-NEXT: v_accvgpr_read_b32 v13, a19
-; GFX908-NEXT: v_accvgpr_read_b32 v14, a20
-; GFX908-NEXT: v_accvgpr_read_b32 v15, a21
-; GFX908-NEXT: v_accvgpr_read_b32 v16, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v12, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v11, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v10, a16
; GFX908-NEXT: v_accvgpr_read_b32 v17, a23
-; GFX908-NEXT: v_accvgpr_read_b32 v18, a8
-; GFX908-NEXT: v_accvgpr_read_b32 v19, a9
-; GFX908-NEXT: v_accvgpr_read_b32 v20, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v16, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v15, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v14, a20
; GFX908-NEXT: v_accvgpr_read_b32 v21, a11
-; GFX908-NEXT: v_accvgpr_read_b32 v22, a12
-; GFX908-NEXT: v_accvgpr_read_b32 v23, a13
-; GFX908-NEXT: v_accvgpr_read_b32 v24, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v20, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v19, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v18, a8
; GFX908-NEXT: v_accvgpr_read_b32 v25, a15
-; GFX908-NEXT: v_accvgpr_read_b32 v26, a0
-; GFX908-NEXT: v_accvgpr_read_b32 v27, a1
-; GFX908-NEXT: v_accvgpr_read_b32 v28, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v24, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v23, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v22, a12
; GFX908-NEXT: v_accvgpr_read_b32 v29, a3
+; GFX908-NEXT: v_accvgpr_read_b32 v28, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v27, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v26, a0
; GFX908-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:96
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a4
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a5
-; GFX908-NEXT: v_accvgpr_read_b32 v4, a6
; GFX908-NEXT: v_accvgpr_read_b32 v5, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a4
; GFX908-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:112
; GFX908-NEXT: global_store_dwordx4 v[0:1], v[10:13], off offset:64
; GFX908-NEXT: global_store_dwordx4 v[0:1], v[14:17], off offset:80
@@ -2339,122 +1015,6 @@ define void @test_mfma_f32_32x32x1f32_nonentry_noagpr(ptr addrspace(1) %arg) #0
; GFX908-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:16
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX90ADAG-LABEL: test_mfma_f32_32x32x1f32_nonentry_noagpr:
-; GFX90ADAG: ; %bb.0: ; %bb
-; GFX90ADAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90ADAG-NEXT: global_load_dwordx4 v[30:33], v[0:1], off offset:112
-; GFX90ADAG-NEXT: global_load_dwordx4 v[26:29], v[0:1], off offset:96
-; GFX90ADAG-NEXT: global_load_dwordx4 v[22:25], v[0:1], off offset:80
-; GFX90ADAG-NEXT: global_load_dwordx4 v[18:21], v[0:1], off offset:64
-; GFX90ADAG-NEXT: global_load_dwordx4 v[14:17], v[0:1], off offset:48
-; GFX90ADAG-NEXT: global_load_dwordx4 v[10:13], v[0:1], off offset:32
-; GFX90ADAG-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16
-; GFX90ADAG-NEXT: global_load_dwordx4 v[2:5], v[0:1], off
-; GFX90ADAG-NEXT: v_mov_b32_e32 v34, 1.0
-; GFX90ADAG-NEXT: v_mov_b32_e32 v35, 2.0
-; GFX90ADAG-NEXT: s_waitcnt vmcnt(0)
-; GFX90ADAG-NEXT: s_nop 0
-; GFX90ADAG-NEXT: v_mfma_f32_32x32x1f32 v[2:33], v34, v35, v[2:33]
-; GFX90ADAG-NEXT: s_nop 15
-; GFX90ADAG-NEXT: s_nop 2
-; GFX90ADAG-NEXT: global_store_dwordx4 v[0:1], v[26:29], off offset:96
-; GFX90ADAG-NEXT: global_store_dwordx4 v[0:1], v[30:33], off offset:112
-; GFX90ADAG-NEXT: global_store_dwordx4 v[0:1], v[18:21], off offset:64
-; GFX90ADAG-NEXT: global_store_dwordx4 v[0:1], v[22:25], off offset:80
-; GFX90ADAG-NEXT: global_store_dwordx4 v[0:1], v[10:13], off offset:32
-; GFX90ADAG-NEXT: global_store_dwordx4 v[0:1], v[14:17], off offset:48
-; GFX90ADAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
-; GFX90ADAG-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16
-; GFX90ADAG-NEXT: s_waitcnt vmcnt(0)
-; GFX90ADAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX90AGSEL-LABEL: test_mfma_f32_32x32x1f32_nonentry_noagpr:
-; GFX90AGSEL: ; %bb.0: ; %bb
-; GFX90AGSEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90AGSEL-NEXT: global_load_dwordx4 v[2:5], v[0:1], off
-; GFX90AGSEL-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16
-; GFX90AGSEL-NEXT: global_load_dwordx4 v[10:13], v[0:1], off offset:32
-; GFX90AGSEL-NEXT: global_load_dwordx4 v[14:17], v[0:1], off offset:48
-; GFX90AGSEL-NEXT: global_load_dwordx4 v[18:21], v[0:1], off offset:64
-; GFX90AGSEL-NEXT: global_load_dwordx4 v[22:25], v[0:1], off offset:80
-; GFX90AGSEL-NEXT: global_load_dwordx4 v[26:29], v[0:1], off offset:96
-; GFX90AGSEL-NEXT: global_load_dwordx4 v[30:33], v[0:1], off offset:112
-; GFX90AGSEL-NEXT: v_mov_b32_e32 v34, 1.0
-; GFX90AGSEL-NEXT: v_mov_b32_e32 v35, 2.0
-; GFX90AGSEL-NEXT: s_waitcnt vmcnt(0)
-; GFX90AGSEL-NEXT: s_nop 0
-; GFX90AGSEL-NEXT: v_mfma_f32_32x32x1f32 v[2:33], v34, v35, v[2:33]
-; GFX90AGSEL-NEXT: s_nop 15
-; GFX90AGSEL-NEXT: s_nop 2
-; GFX90AGSEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
-; GFX90AGSEL-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16
-; GFX90AGSEL-NEXT: global_store_dwordx4 v[0:1], v[10:13], off offset:32
-; GFX90AGSEL-NEXT: global_store_dwordx4 v[0:1], v[14:17], off offset:48
-; GFX90AGSEL-NEXT: global_store_dwordx4 v[0:1], v[18:21], off offset:64
-; GFX90AGSEL-NEXT: global_store_dwordx4 v[0:1], v[22:25], off offset:80
-; GFX90AGSEL-NEXT: global_store_dwordx4 v[0:1], v[26:29], off offset:96
-; GFX90AGSEL-NEXT: global_store_dwordx4 v[0:1], v[30:33], off offset:112
-; GFX90AGSEL-NEXT: s_waitcnt vmcnt(0)
-; GFX90AGSEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942DAG-LABEL: test_mfma_f32_32x32x1f32_nonentry_noagpr:
-; GFX942DAG: ; %bb.0: ; %bb
-; GFX942DAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942DAG-NEXT: global_load_dwordx4 v[30:33], v[0:1], off offset:112
-; GFX942DAG-NEXT: global_load_dwordx4 v[26:29], v[0:1], off offset:96
-; GFX942DAG-NEXT: global_load_dwordx4 v[22:25], v[0:1], off offset:80
-; GFX942DAG-NEXT: global_load_dwordx4 v[18:21], v[0:1], off offset:64
-; GFX942DAG-NEXT: global_load_dwordx4 v[14:17], v[0:1], off offset:48
-; GFX942DAG-NEXT: global_load_dwordx4 v[10:13], v[0:1], off offset:32
-; GFX942DAG-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16
-; GFX942DAG-NEXT: global_load_dwordx4 v[2:5], v[0:1], off
-; GFX942DAG-NEXT: v_mov_b32_e32 v34, 1.0
-; GFX942DAG-NEXT: v_mov_b32_e32 v35, 2.0
-; GFX942DAG-NEXT: s_waitcnt vmcnt(0)
-; GFX942DAG-NEXT: s_nop 0
-; GFX942DAG-NEXT: v_mfma_f32_32x32x1_2b_f32 v[2:33], v34, v35, v[2:33]
-; GFX942DAG-NEXT: s_nop 15
-; GFX942DAG-NEXT: s_nop 1
-; GFX942DAG-NEXT: global_store_dwordx4 v[0:1], v[26:29], off offset:96
-; GFX942DAG-NEXT: global_store_dwordx4 v[0:1], v[30:33], off offset:112
-; GFX942DAG-NEXT: global_store_dwordx4 v[0:1], v[18:21], off offset:64
-; GFX942DAG-NEXT: global_store_dwordx4 v[0:1], v[22:25], off offset:80
-; GFX942DAG-NEXT: global_store_dwordx4 v[0:1], v[10:13], off offset:32
-; GFX942DAG-NEXT: global_store_dwordx4 v[0:1], v[14:17], off offset:48
-; GFX942DAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
-; GFX942DAG-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16
-; GFX942DAG-NEXT: s_waitcnt vmcnt(0)
-; GFX942DAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942GSEL-LABEL: test_mfma_f32_32x32x1f32_nonentry_noagpr:
-; GFX942GSEL: ; %bb.0: ; %bb
-; GFX942GSEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942GSEL-NEXT: global_load_dwordx4 v[2:5], v[0:1], off
-; GFX942GSEL-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16
-; GFX942GSEL-NEXT: global_load_dwordx4 v[10:13], v[0:1], off offset:32
-; GFX942GSEL-NEXT: global_load_dwordx4 v[14:17], v[0:1], off offset:48
-; GFX942GSEL-NEXT: global_load_dwordx4 v[18:21], v[0:1], off offset:64
-; GFX942GSEL-NEXT: global_load_dwordx4 v[22:25], v[0:1], off offset:80
-; GFX942GSEL-NEXT: global_load_dwordx4 v[26:29], v[0:1], off offset:96
-; GFX942GSEL-NEXT: global_load_dwordx4 v[30:33], v[0:1], off offset:112
-; GFX942GSEL-NEXT: v_mov_b32_e32 v34, 1.0
-; GFX942GSEL-NEXT: v_mov_b32_e32 v35, 2.0
-; GFX942GSEL-NEXT: s_waitcnt vmcnt(0)
-; GFX942GSEL-NEXT: s_nop 0
-; GFX942GSEL-NEXT: v_mfma_f32_32x32x1_2b_f32 v[2:33], v34, v35, v[2:33]
-; GFX942GSEL-NEXT: s_nop 15
-; GFX942GSEL-NEXT: s_nop 1
-; GFX942GSEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
-; GFX942GSEL-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16
-; GFX942GSEL-NEXT: global_store_dwordx4 v[0:1], v[10:13], off offset:32
-; GFX942GSEL-NEXT: global_store_dwordx4 v[0:1], v[14:17], off offset:48
-; GFX942GSEL-NEXT: global_store_dwordx4 v[0:1], v[18:21], off offset:64
-; GFX942GSEL-NEXT: global_store_dwordx4 v[0:1], v[22:25], off offset:80
-; GFX942GSEL-NEXT: global_store_dwordx4 v[0:1], v[26:29], off offset:96
-; GFX942GSEL-NEXT: global_store_dwordx4 v[0:1], v[30:33], off offset:112
-; GFX942GSEL-NEXT: s_waitcnt vmcnt(0)
-; GFX942GSEL-NEXT: s_setpc_b64 s[30:31]
bb:
%in.1 = load <32 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0)
@@ -2513,40 +1073,40 @@ define void @test_mfma_f32_32x32x1f32_nonentry_with_agpr(ptr addrspace(1) %arg)
; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
; GFX908-NEXT: s_nop 15
; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a24
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a25
-; GFX908-NEXT: v_accvgpr_read_b32 v4, a26
; GFX908-NEXT: v_accvgpr_read_b32 v5, a27
-; GFX908-NEXT: v_accvgpr_read_b32 v6, a28
-; GFX908-NEXT: v_accvgpr_read_b32 v7, a29
-; GFX908-NEXT: v_accvgpr_read_b32 v8, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a24
; GFX908-NEXT: v_accvgpr_read_b32 v9, a31
-; GFX908-NEXT: v_accvgpr_read_b32 v10, a16
-; GFX908-NEXT: v_accvgpr_read_b32 v11, a17
-; GFX908-NEXT: v_accvgpr_read_b32 v12, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v8, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v7, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a28
; GFX908-NEXT: v_accvgpr_read_b32 v13, a19
-; GFX908-NEXT: v_accvgpr_read_b32 v14, a20
-; GFX908-NEXT: v_accvgpr_read_b32 v15, a21
-; GFX908-NEXT: v_accvgpr_read_b32 v16, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v12, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v11, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v10, a16
; GFX908-NEXT: v_accvgpr_read_b32 v17, a23
-; GFX908-NEXT: v_accvgpr_read_b32 v18, a8
-; GFX908-NEXT: v_accvgpr_read_b32 v19, a9
-; GFX908-NEXT: v_accvgpr_read_b32 v20, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v16, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v15, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v14, a20
; GFX908-NEXT: v_accvgpr_read_b32 v21, a11
-; GFX908-NEXT: v_accvgpr_read_b32 v22, a12
-; GFX908-NEXT: v_accvgpr_read_b32 v23, a13
-; GFX908-NEXT: v_accvgpr_read_b32 v24, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v20, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v19, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v18, a8
; GFX908-NEXT: v_accvgpr_read_b32 v25, a15
-; GFX908-NEXT: v_accvgpr_read_b32 v26, a0
-; GFX908-NEXT: v_accvgpr_read_b32 v27, a1
-; GFX908-NEXT: v_accvgpr_read_b32 v28, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v24, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v23, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v22, a12
; GFX908-NEXT: v_accvgpr_read_b32 v29, a3
+; GFX908-NEXT: v_accvgpr_read_b32 v28, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v27, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v26, a0
; GFX908-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:96
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a4
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a5
-; GFX908-NEXT: v_accvgpr_read_b32 v4, a6
; GFX908-NEXT: v_accvgpr_read_b32 v5, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a4
; GFX908-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:112
; GFX908-NEXT: global_store_dwordx4 v[0:1], v[10:13], off offset:64
; GFX908-NEXT: global_store_dwordx4 v[0:1], v[14:17], off offset:80
@@ -2556,122 +1116,6 @@ define void @test_mfma_f32_32x32x1f32_nonentry_with_agpr(ptr addrspace(1) %arg)
; GFX908-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:16
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX90ADAG-LABEL: test_mfma_f32_32x32x1f32_nonentry_with_agpr:
-; GFX90ADAG: ; %bb.0: ; %bb
-; GFX90ADAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90ADAG-NEXT: global_load_dwordx4 a[28:31], v[0:1], off offset:112
-; GFX90ADAG-NEXT: global_load_dwordx4 a[24:27], v[0:1], off offset:96
-; GFX90ADAG-NEXT: global_load_dwordx4 a[20:23], v[0:1], off offset:80
-; GFX90ADAG-NEXT: global_load_dwordx4 a[16:19], v[0:1], off offset:64
-; GFX90ADAG-NEXT: global_load_dwordx4 a[12:15], v[0:1], off offset:48
-; GFX90ADAG-NEXT: global_load_dwordx4 a[8:11], v[0:1], off offset:32
-; GFX90ADAG-NEXT: global_load_dwordx4 a[4:7], v[0:1], off offset:16
-; GFX90ADAG-NEXT: global_load_dwordx4 a[0:3], v[0:1], off
-; GFX90ADAG-NEXT: v_mov_b32_e32 v2, 1.0
-; GFX90ADAG-NEXT: v_mov_b32_e32 v3, 2.0
-; GFX90ADAG-NEXT: s_waitcnt vmcnt(0)
-; GFX90ADAG-NEXT: s_nop 0
-; GFX90ADAG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
-; GFX90ADAG-NEXT: s_nop 15
-; GFX90ADAG-NEXT: s_nop 2
-; GFX90ADAG-NEXT: global_store_dwordx4 v[0:1], a[24:27], off offset:96
-; GFX90ADAG-NEXT: global_store_dwordx4 v[0:1], a[28:31], off offset:112
-; GFX90ADAG-NEXT: global_store_dwordx4 v[0:1], a[16:19], off offset:64
-; GFX90ADAG-NEXT: global_store_dwordx4 v[0:1], a[20:23], off offset:80
-; GFX90ADAG-NEXT: global_store_dwordx4 v[0:1], a[8:11], off offset:32
-; GFX90ADAG-NEXT: global_store_dwordx4 v[0:1], a[12:15], off offset:48
-; GFX90ADAG-NEXT: global_store_dwordx4 v[0:1], a[0:3], off
-; GFX90ADAG-NEXT: global_store_dwordx4 v[0:1], a[4:7], off offset:16
-; GFX90ADAG-NEXT: s_waitcnt vmcnt(0)
-; GFX90ADAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX90AGSEL-LABEL: test_mfma_f32_32x32x1f32_nonentry_with_agpr:
-; GFX90AGSEL: ; %bb.0: ; %bb
-; GFX90AGSEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90AGSEL-NEXT: global_load_dwordx4 a[0:3], v[0:1], off
-; GFX90AGSEL-NEXT: global_load_dwordx4 a[4:7], v[0:1], off offset:16
-; GFX90AGSEL-NEXT: global_load_dwordx4 a[8:11], v[0:1], off offset:32
-; GFX90AGSEL-NEXT: global_load_dwordx4 a[12:15], v[0:1], off offset:48
-; GFX90AGSEL-NEXT: global_load_dwordx4 a[16:19], v[0:1], off offset:64
-; GFX90AGSEL-NEXT: global_load_dwordx4 a[20:23], v[0:1], off offset:80
-; GFX90AGSEL-NEXT: global_load_dwordx4 a[24:27], v[0:1], off offset:96
-; GFX90AGSEL-NEXT: global_load_dwordx4 a[28:31], v[0:1], off offset:112
-; GFX90AGSEL-NEXT: v_mov_b32_e32 v2, 1.0
-; GFX90AGSEL-NEXT: v_mov_b32_e32 v3, 2.0
-; GFX90AGSEL-NEXT: s_waitcnt vmcnt(0)
-; GFX90AGSEL-NEXT: s_nop 0
-; GFX90AGSEL-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
-; GFX90AGSEL-NEXT: s_nop 15
-; GFX90AGSEL-NEXT: s_nop 2
-; GFX90AGSEL-NEXT: global_store_dwordx4 v[0:1], a[0:3], off
-; GFX90AGSEL-NEXT: global_store_dwordx4 v[0:1], a[4:7], off offset:16
-; GFX90AGSEL-NEXT: global_store_dwordx4 v[0:1], a[8:11], off offset:32
-; GFX90AGSEL-NEXT: global_store_dwordx4 v[0:1], a[12:15], off offset:48
-; GFX90AGSEL-NEXT: global_store_dwordx4 v[0:1], a[16:19], off offset:64
-; GFX90AGSEL-NEXT: global_store_dwordx4 v[0:1], a[20:23], off offset:80
-; GFX90AGSEL-NEXT: global_store_dwordx4 v[0:1], a[24:27], off offset:96
-; GFX90AGSEL-NEXT: global_store_dwordx4 v[0:1], a[28:31], off offset:112
-; GFX90AGSEL-NEXT: s_waitcnt vmcnt(0)
-; GFX90AGSEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942DAG-LABEL: test_mfma_f32_32x32x1f32_nonentry_with_agpr:
-; GFX942DAG: ; %bb.0: ; %bb
-; GFX942DAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942DAG-NEXT: global_load_dwordx4 a[28:31], v[0:1], off offset:112
-; GFX942DAG-NEXT: global_load_dwordx4 a[24:27], v[0:1], off offset:96
-; GFX942DAG-NEXT: global_load_dwordx4 a[20:23], v[0:1], off offset:80
-; GFX942DAG-NEXT: global_load_dwordx4 a[16:19], v[0:1], off offset:64
-; GFX942DAG-NEXT: global_load_dwordx4 a[12:15], v[0:1], off offset:48
-; GFX942DAG-NEXT: global_load_dwordx4 a[8:11], v[0:1], off offset:32
-; GFX942DAG-NEXT: global_load_dwordx4 a[4:7], v[0:1], off offset:16
-; GFX942DAG-NEXT: global_load_dwordx4 a[0:3], v[0:1], off
-; GFX942DAG-NEXT: v_mov_b32_e32 v2, 1.0
-; GFX942DAG-NEXT: v_mov_b32_e32 v3, 2.0
-; GFX942DAG-NEXT: s_waitcnt vmcnt(0)
-; GFX942DAG-NEXT: s_nop 0
-; GFX942DAG-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v2, v3, a[0:31]
-; GFX942DAG-NEXT: s_nop 15
-; GFX942DAG-NEXT: s_nop 1
-; GFX942DAG-NEXT: global_store_dwordx4 v[0:1], a[24:27], off offset:96
-; GFX942DAG-NEXT: global_store_dwordx4 v[0:1], a[28:31], off offset:112
-; GFX942DAG-NEXT: global_store_dwordx4 v[0:1], a[16:19], off offset:64
-; GFX942DAG-NEXT: global_store_dwordx4 v[0:1], a[20:23], off offset:80
-; GFX942DAG-NEXT: global_store_dwordx4 v[0:1], a[8:11], off offset:32
-; GFX942DAG-NEXT: global_store_dwordx4 v[0:1], a[12:15], off offset:48
-; GFX942DAG-NEXT: global_store_dwordx4 v[0:1], a[0:3], off
-; GFX942DAG-NEXT: global_store_dwordx4 v[0:1], a[4:7], off offset:16
-; GFX942DAG-NEXT: s_waitcnt vmcnt(0)
-; GFX942DAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942GSEL-LABEL: test_mfma_f32_32x32x1f32_nonentry_with_agpr:
-; GFX942GSEL: ; %bb.0: ; %bb
-; GFX942GSEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942GSEL-NEXT: global_load_dwordx4 a[0:3], v[0:1], off
-; GFX942GSEL-NEXT: global_load_dwordx4 a[4:7], v[0:1], off offset:16
-; GFX942GSEL-NEXT: global_load_dwordx4 a[8:11], v[0:1], off offset:32
-; GFX942GSEL-NEXT: global_load_dwordx4 a[12:15], v[0:1], off offset:48
-; GFX942GSEL-NEXT: global_load_dwordx4 a[16:19], v[0:1], off offset:64
-; GFX942GSEL-NEXT: global_load_dwordx4 a[20:23], v[0:1], off offset:80
-; GFX942GSEL-NEXT: global_load_dwordx4 a[24:27], v[0:1], off offset:96
-; GFX942GSEL-NEXT: global_load_dwordx4 a[28:31], v[0:1], off offset:112
-; GFX942GSEL-NEXT: v_mov_b32_e32 v2, 1.0
-; GFX942GSEL-NEXT: v_mov_b32_e32 v3, 2.0
-; GFX942GSEL-NEXT: s_waitcnt vmcnt(0)
-; GFX942GSEL-NEXT: s_nop 0
-; GFX942GSEL-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v2, v3, a[0:31]
-; GFX942GSEL-NEXT: s_nop 15
-; GFX942GSEL-NEXT: s_nop 1
-; GFX942GSEL-NEXT: global_store_dwordx4 v[0:1], a[0:3], off
-; GFX942GSEL-NEXT: global_store_dwordx4 v[0:1], a[4:7], off offset:16
-; GFX942GSEL-NEXT: global_store_dwordx4 v[0:1], a[8:11], off offset:32
-; GFX942GSEL-NEXT: global_store_dwordx4 v[0:1], a[12:15], off offset:48
-; GFX942GSEL-NEXT: global_store_dwordx4 v[0:1], a[16:19], off offset:64
-; GFX942GSEL-NEXT: global_store_dwordx4 v[0:1], a[20:23], off offset:80
-; GFX942GSEL-NEXT: global_store_dwordx4 v[0:1], a[24:27], off offset:96
-; GFX942GSEL-NEXT: global_store_dwordx4 v[0:1], a[28:31], off offset:112
-; GFX942GSEL-NEXT: s_waitcnt vmcnt(0)
-; GFX942GSEL-NEXT: s_setpc_b64 s[30:31]
bb:
%in.1 = load <32 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0)
@@ -2686,5 +1130,5 @@ attributes #1 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="2
attributes #2 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-agpr-alloc"="0" }
attributes #3 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="2" }
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}
; GFX90A: {{.*}}
-; GFX942: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
index d444db8cd1bdf..0af655dfbbee9 100644
--- a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
@@ -54,64 +54,49 @@ define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 {
; GFX908-NEXT: s_cbranch_scc1 .LBB0_1
; GFX908-NEXT: ; %bb.2: ; %exit
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX908-NEXT: v_mov_b32_e32 v4, 0
-; GFX908-NEXT: s_nop 12
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a28
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a29
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a30
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a31
-; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a20
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a21
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a22
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a23
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a16
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a17
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a18
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a19
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a12
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a13
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a15
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a8
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a9
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a10
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a11
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
-; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: s_nop 13
; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v28, a28
+; GFX908-NEXT: v_accvgpr_read_b32 v29, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v30, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v31, a31
+; GFX908-NEXT: v_mov_b32_e32 v32, 0
; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a4
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v7, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v8, a8
+; GFX908-NEXT: v_accvgpr_read_b32 v9, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v10, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v11, a11
+; GFX908-NEXT: v_accvgpr_read_b32 v12, a12
+; GFX908-NEXT: v_accvgpr_read_b32 v13, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v14, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v15, a15
+; GFX908-NEXT: v_accvgpr_read_b32 v16, a16
+; GFX908-NEXT: v_accvgpr_read_b32 v17, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v18, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v19, a19
+; GFX908-NEXT: v_accvgpr_read_b32 v20, a20
+; GFX908-NEXT: v_accvgpr_read_b32 v21, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v22, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v23, a23
+; GFX908-NEXT: v_accvgpr_read_b32 v24, a24
+; GFX908-NEXT: v_accvgpr_read_b32 v25, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v26, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v27, a27
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
+; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
; GFX908-NEXT: s_endpgm
;
; GFX90A-LABEL: test_mfma_loop_zeroinit:
@@ -300,64 +285,49 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(ptr addrspace(1) %arg
; GFX908-NEXT: s_cbranch_scc1 .LBB1_1
; GFX908-NEXT: ; %bb.2: ; %exit
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX908-NEXT: v_mov_b32_e32 v4, 0
-; GFX908-NEXT: s_nop 12
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a28
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a29
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a30
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a31
-; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a20
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a21
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a22
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a23
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a16
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a17
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a18
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a19
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a12
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a13
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a15
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a8
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a9
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a10
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a11
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
-; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: s_nop 13
; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v28, a28
+; GFX908-NEXT: v_accvgpr_read_b32 v29, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v30, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v31, a31
+; GFX908-NEXT: v_mov_b32_e32 v32, 0
; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a4
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v7, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v8, a8
+; GFX908-NEXT: v_accvgpr_read_b32 v9, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v10, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v11, a11
+; GFX908-NEXT: v_accvgpr_read_b32 v12, a12
+; GFX908-NEXT: v_accvgpr_read_b32 v13, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v14, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v15, a15
+; GFX908-NEXT: v_accvgpr_read_b32 v16, a16
+; GFX908-NEXT: v_accvgpr_read_b32 v17, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v18, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v19, a19
+; GFX908-NEXT: v_accvgpr_read_b32 v20, a20
+; GFX908-NEXT: v_accvgpr_read_b32 v21, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v22, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v23, a23
+; GFX908-NEXT: v_accvgpr_read_b32 v24, a24
+; GFX908-NEXT: v_accvgpr_read_b32 v25, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v26, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v27, a27
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
+; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
; GFX908-NEXT: s_endpgm
;
; GFX90A-LABEL: test_mfma_loop_unfoldable_splat:
@@ -542,69 +512,53 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 {
; GFX908-NEXT: s_cbranch_scc1 .LBB2_1
; GFX908-NEXT: ; %bb.2: ; %exit
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX908-NEXT: v_mov_b32_e32 v4, 0
-; GFX908-NEXT: s_nop 12
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a28
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a29
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a30
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a31
-; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a20
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a21
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a22
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a23
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a16
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a17
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a18
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a19
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a12
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a13
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a15
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a8
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a9
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a10
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a11
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
-; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: s_nop 13
; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v28, a28
+; GFX908-NEXT: v_accvgpr_read_b32 v29, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v30, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v31, a31
+; GFX908-NEXT: v_mov_b32_e32 v32, 0
; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a4
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v7, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v8, a8
+; GFX908-NEXT: v_accvgpr_read_b32 v9, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v10, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v11, a11
+; GFX908-NEXT: v_accvgpr_read_b32 v12, a12
+; GFX908-NEXT: v_accvgpr_read_b32 v13, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v14, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v15, a15
+; GFX908-NEXT: v_accvgpr_read_b32 v16, a16
+; GFX908-NEXT: v_accvgpr_read_b32 v17, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v18, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v19, a19
+; GFX908-NEXT: v_accvgpr_read_b32 v20, a20
+; GFX908-NEXT: v_accvgpr_read_b32 v21, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v22, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v23, a23
+; GFX908-NEXT: v_accvgpr_read_b32 v24, a24
+; GFX908-NEXT: v_accvgpr_read_b32 v25, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v26, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v27, a27
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
+; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
; GFX908-NEXT: s_endpgm
;
; GFX90A-LABEL: test_mfma_loop_non_splat:
; GFX90A: ; %bb.0: ; %entry
-; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0
; GFX90A-NEXT: v_accvgpr_write_b32 a1, 1.0
; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0
; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0
@@ -638,6 +592,7 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 {
; GFX90A-NEXT: v_accvgpr_write_b32 a2, 0
; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0
; GFX90A-NEXT: s_mov_b32 s0, 16
+; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0
; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0
; GFX90A-NEXT: .LBB2_1: ; %for.cond.preheader
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -663,7 +618,6 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 {
;
; GFX942-LABEL: test_mfma_loop_non_splat:
; GFX942: ; %bb.0: ; %entry
-; GFX942-NEXT: v_mov_b32_e32 v0, 1.0
; GFX942-NEXT: v_accvgpr_write_b32 a1, 1.0
; GFX942-NEXT: v_accvgpr_write_b32 a31, 0
; GFX942-NEXT: v_accvgpr_write_b32 a30, 0
@@ -697,6 +651,7 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 {
; GFX942-NEXT: v_accvgpr_write_b32 a2, 0
; GFX942-NEXT: v_accvgpr_write_b32 a0, 0
; GFX942-NEXT: s_mov_b32 s0, 16
+; GFX942-NEXT: v_mov_b32_e32 v0, 1.0
; GFX942-NEXT: v_mov_b32_e32 v1, 2.0
; GFX942-NEXT: .LBB2_1: ; %for.cond.preheader
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -819,64 +774,49 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(ptr addrspace(1) %arg)
; GFX908-NEXT: s_cbranch_scc1 .LBB3_1
; GFX908-NEXT: ; %bb.2: ; %exit
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX908-NEXT: v_mov_b32_e32 v4, 0
-; GFX908-NEXT: s_nop 12
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a28
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a29
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a30
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a31
-; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a20
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a21
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a22
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a23
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a16
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a17
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a18
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a19
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a12
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a13
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a15
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a8
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a9
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a10
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a11
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
-; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: s_nop 13
; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v28, a28
+; GFX908-NEXT: v_accvgpr_read_b32 v29, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v30, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v31, a31
+; GFX908-NEXT: v_mov_b32_e32 v32, 0
; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a4
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v7, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v8, a8
+; GFX908-NEXT: v_accvgpr_read_b32 v9, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v10, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v11, a11
+; GFX908-NEXT: v_accvgpr_read_b32 v12, a12
+; GFX908-NEXT: v_accvgpr_read_b32 v13, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v14, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v15, a15
+; GFX908-NEXT: v_accvgpr_read_b32 v16, a16
+; GFX908-NEXT: v_accvgpr_read_b32 v17, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v18, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v19, a19
+; GFX908-NEXT: v_accvgpr_read_b32 v20, a20
+; GFX908-NEXT: v_accvgpr_read_b32 v21, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v22, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v23, a23
+; GFX908-NEXT: v_accvgpr_read_b32 v24, a24
+; GFX908-NEXT: v_accvgpr_read_b32 v25, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v26, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v27, a27
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
+; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
; GFX908-NEXT: s_endpgm
;
; GFX90A-LABEL: test_mfma_loop_unfoldable_seq:
@@ -1079,179 +1019,133 @@ exit:
define amdgpu_kernel void @test_mfma_loop_vgpr_init(ptr addrspace(1) %arg) #0 {
; GFX908-LABEL: test_mfma_loop_vgpr_init:
; GFX908: ; %bb.0: ; %entry
+; GFX908-NEXT: v_accvgpr_write_b32 a31, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a30, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a29, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a28, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a27, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a26, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a25, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a24, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a23, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a22, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a21, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a20, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a19, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a18, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a17, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a16, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a15, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a14, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a13, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a12, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a11, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a10, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a9, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a8, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a7, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a6, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a5, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a4, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a2, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v0
; GFX908-NEXT: v_accvgpr_write_b32 a0, v0
; GFX908-NEXT: s_mov_b32 s0, 16
; GFX908-NEXT: v_mov_b32_e32 v0, 2.0
; GFX908-NEXT: v_mov_b32_e32 v1, 1.0
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a0
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
-; GFX908-NEXT: v_accvgpr_read_b32 v5, a0
-; GFX908-NEXT: v_accvgpr_write_b32 a1, v2
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a0
-; GFX908-NEXT: v_accvgpr_write_b32 a2, v3
-; GFX908-NEXT: v_accvgpr_write_b32 a3, v5
-; GFX908-NEXT: v_accvgpr_write_b32 a4, v2
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
-; GFX908-NEXT: v_accvgpr_read_b32 v5, a0
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a0
-; GFX908-NEXT: v_accvgpr_write_b32 a5, v3
-; GFX908-NEXT: v_accvgpr_write_b32 a6, v5
-; GFX908-NEXT: v_accvgpr_write_b32 a7, v2
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
-; GFX908-NEXT: v_accvgpr_read_b32 v5, a0
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a0
-; GFX908-NEXT: v_accvgpr_write_b32 a8, v3
-; GFX908-NEXT: v_accvgpr_write_b32 a9, v5
-; GFX908-NEXT: v_accvgpr_write_b32 a10, v2
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
-; GFX908-NEXT: v_accvgpr_read_b32 v5, a0
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a0
-; GFX908-NEXT: v_accvgpr_write_b32 a11, v3
-; GFX908-NEXT: v_accvgpr_write_b32 a12, v5
-; GFX908-NEXT: v_accvgpr_write_b32 a13, v2
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
-; GFX908-NEXT: v_accvgpr_read_b32 v5, a0
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a0
-; GFX908-NEXT: v_accvgpr_write_b32 a14, v3
-; GFX908-NEXT: v_accvgpr_write_b32 a15, v5
-; GFX908-NEXT: v_accvgpr_write_b32 a16, v2
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
-; GFX908-NEXT: v_accvgpr_read_b32 v5, a0
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a0
-; GFX908-NEXT: v_accvgpr_write_b32 a17, v3
-; GFX908-NEXT: v_accvgpr_write_b32 a18, v5
-; GFX908-NEXT: v_accvgpr_write_b32 a19, v2
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
-; GFX908-NEXT: v_accvgpr_read_b32 v5, a0
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a0
-; GFX908-NEXT: v_accvgpr_write_b32 a20, v3
-; GFX908-NEXT: v_accvgpr_write_b32 a21, v5
-; GFX908-NEXT: v_accvgpr_write_b32 a22, v2
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
-; GFX908-NEXT: v_accvgpr_read_b32 v5, a0
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a0
-; GFX908-NEXT: v_accvgpr_write_b32 a23, v3
-; GFX908-NEXT: v_accvgpr_write_b32 a24, v5
-; GFX908-NEXT: v_accvgpr_write_b32 a25, v2
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
-; GFX908-NEXT: v_accvgpr_read_b32 v5, a0
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a0
-; GFX908-NEXT: v_accvgpr_write_b32 a26, v3
-; GFX908-NEXT: v_accvgpr_write_b32 a27, v5
-; GFX908-NEXT: v_accvgpr_write_b32 a28, v2
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
-; GFX908-NEXT: v_accvgpr_read_b32 v5, a0
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a0
-; GFX908-NEXT: v_accvgpr_write_b32 a29, v3
-; GFX908-NEXT: v_accvgpr_write_b32 a30, v5
-; GFX908-NEXT: v_accvgpr_write_b32 a31, v2
; GFX908-NEXT: .LBB4_1: ; %for.cond.preheader
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: s_nop 1
; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
; GFX908-NEXT: s_add_i32 s0, s0, -1
; GFX908-NEXT: s_cmp_lg_u32 s0, 0
; GFX908-NEXT: s_cbranch_scc1 .LBB4_1
; GFX908-NEXT: ; %bb.2: ; %exit
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX908-NEXT: v_mov_b32_e32 v4, 0
-; GFX908-NEXT: s_nop 12
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a28
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a29
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a30
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a31
-; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a20
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a21
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a22
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a23
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a16
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a17
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a18
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a19
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a12
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a13
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a15
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a8
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a9
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a10
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a11
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
-; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: s_nop 13
; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v28, a28
+; GFX908-NEXT: v_accvgpr_read_b32 v29, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v30, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v31, a31
+; GFX908-NEXT: v_mov_b32_e32 v32, 0
; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a4
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v7, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v8, a8
+; GFX908-NEXT: v_accvgpr_read_b32 v9, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v10, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v11, a11
+; GFX908-NEXT: v_accvgpr_read_b32 v12, a12
+; GFX908-NEXT: v_accvgpr_read_b32 v13, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v14, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v15, a15
+; GFX908-NEXT: v_accvgpr_read_b32 v16, a16
+; GFX908-NEXT: v_accvgpr_read_b32 v17, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v18, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v19, a19
+; GFX908-NEXT: v_accvgpr_read_b32 v20, a20
+; GFX908-NEXT: v_accvgpr_read_b32 v21, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v22, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v23, a23
+; GFX908-NEXT: v_accvgpr_read_b32 v24, a24
+; GFX908-NEXT: v_accvgpr_read_b32 v25, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v26, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v27, a27
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
+; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
; GFX908-NEXT: s_endpgm
;
; GFX90A-LABEL: test_mfma_loop_vgpr_init:
; GFX90A: ; %bb.0: ; %entry
-; GFX90A-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX90A-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a31, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a30, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a29, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a28, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a27, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a26, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a25, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a24, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a23, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a22, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a21, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a20, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a19, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a18, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a17, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a16, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a15, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a14, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a13, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a12, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a11, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a10, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a9, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a8, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a7, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a6, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a5, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a4, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a3, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_mov_b32 s0, 16
; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0
; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a2, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a3, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a4, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a5, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a6, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a7, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a8, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a9, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a10, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a11, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a12, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a13, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a14, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a15, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a16, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a17, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a18, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a19, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a20, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a21, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a22, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a23, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a24, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a25, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a26, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a27, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a28, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a29, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a30, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a31, v2
; GFX90A-NEXT: .LBB4_1: ; %for.cond.preheader
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_nop 1
@@ -1276,42 +1170,42 @@ define amdgpu_kernel void @test_mfma_loop_vgpr_init(ptr addrspace(1) %arg) #0 {
;
; GFX942-LABEL: test_mfma_loop_vgpr_init:
; GFX942: ; %bb.0: ; %entry
-; GFX942-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a31, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a30, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a29, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a28, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a27, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a26, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a25, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a24, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a23, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a22, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a21, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a20, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a19, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a18, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a17, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a16, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a15, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a14, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a13, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a12, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a11, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a10, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a9, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a8, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a7, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a6, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a5, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a4, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a3, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a2, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a1, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a0, v0
; GFX942-NEXT: s_mov_b32 s0, 16
; GFX942-NEXT: v_mov_b32_e32 v0, 2.0
; GFX942-NEXT: v_mov_b32_e32 v1, 1.0
-; GFX942-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX942-NEXT: v_accvgpr_write_b32 a1, v2
-; GFX942-NEXT: v_accvgpr_write_b32 a2, v2
-; GFX942-NEXT: v_accvgpr_write_b32 a3, v2
-; GFX942-NEXT: v_accvgpr_write_b32 a4, v2
-; GFX942-NEXT: v_accvgpr_write_b32 a5, v2
-; GFX942-NEXT: v_accvgpr_write_b32 a6, v2
-; GFX942-NEXT: v_accvgpr_write_b32 a7, v2
-; GFX942-NEXT: v_accvgpr_write_b32 a8, v2
-; GFX942-NEXT: v_accvgpr_write_b32 a9, v2
-; GFX942-NEXT: v_accvgpr_write_b32 a10, v2
-; GFX942-NEXT: v_accvgpr_write_b32 a11, v2
-; GFX942-NEXT: v_accvgpr_write_b32 a12, v2
-; GFX942-NEXT: v_accvgpr_write_b32 a13, v2
-; GFX942-NEXT: v_accvgpr_write_b32 a14, v2
-; GFX942-NEXT: v_accvgpr_write_b32 a15, v2
-; GFX942-NEXT: v_accvgpr_write_b32 a16, v2
-; GFX942-NEXT: v_accvgpr_write_b32 a17, v2
-; GFX942-NEXT: v_accvgpr_write_b32 a18, v2
-; GFX942-NEXT: v_accvgpr_write_b32 a19, v2
-; GFX942-NEXT: v_accvgpr_write_b32 a20, v2
-; GFX942-NEXT: v_accvgpr_write_b32 a21, v2
-; GFX942-NEXT: v_accvgpr_write_b32 a22, v2
-; GFX942-NEXT: v_accvgpr_write_b32 a23, v2
-; GFX942-NEXT: v_accvgpr_write_b32 a24, v2
-; GFX942-NEXT: v_accvgpr_write_b32 a25, v2
-; GFX942-NEXT: v_accvgpr_write_b32 a26, v2
-; GFX942-NEXT: v_accvgpr_write_b32 a27, v2
-; GFX942-NEXT: v_accvgpr_write_b32 a28, v2
-; GFX942-NEXT: v_accvgpr_write_b32 a29, v2
-; GFX942-NEXT: v_accvgpr_write_b32 a30, v2
-; GFX942-NEXT: v_accvgpr_write_b32 a31, v2
; GFX942-NEXT: .LBB4_1: ; %for.cond.preheader
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_nop 1
@@ -1435,105 +1329,91 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float
; GFX908-NEXT: s_cbranch_scc1 .LBB5_1
; GFX908-NEXT: ; %bb.2: ; %exit
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX908-NEXT: v_mov_b32_e32 v4, 0
-; GFX908-NEXT: s_nop 12
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a28
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a29
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a30
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a31
-; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a20
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a21
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a22
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a23
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a16
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a17
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a18
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a19
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a12
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a13
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a15
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a8
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a9
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a10
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a11
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
-; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: s_nop 13
; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v28, a28
+; GFX908-NEXT: v_accvgpr_read_b32 v29, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v30, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v31, a31
+; GFX908-NEXT: v_mov_b32_e32 v32, 0
; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a4
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v7, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v8, a8
+; GFX908-NEXT: v_accvgpr_read_b32 v9, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v10, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v11, a11
+; GFX908-NEXT: v_accvgpr_read_b32 v12, a12
+; GFX908-NEXT: v_accvgpr_read_b32 v13, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v14, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v15, a15
+; GFX908-NEXT: v_accvgpr_read_b32 v16, a16
+; GFX908-NEXT: v_accvgpr_read_b32 v17, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v18, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v19, a19
+; GFX908-NEXT: v_accvgpr_read_b32 v20, a20
+; GFX908-NEXT: v_accvgpr_read_b32 v21, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v22, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v23, a23
+; GFX908-NEXT: v_accvgpr_read_b32 v24, a24
+; GFX908-NEXT: v_accvgpr_read_b32 v25, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v26, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v27, a27
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
+; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
; GFX908-NEXT: s_endpgm
;
; GFX90A-LABEL: test_mfma_loop_sgpr_init:
; GFX90A: ; %bb.0: ; %entry
; GFX90A-NEXT: s_load_dword s1, s[4:5], 0x2c
; GFX90A-NEXT: s_mov_b32 s0, 16
-; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0
; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, s1
-; GFX90A-NEXT: v_accvgpr_mov_b32 a1, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a4, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a5, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a6, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a7, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a8, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a9, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a10, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a11, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a12, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a13, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a14, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a15, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a16, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a17, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a18, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a19, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a20, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a21, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a22, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a23, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a24, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a25, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a26, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a27, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a28, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a29, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a30, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a31, a0
+; GFX90A-NEXT: v_mov_b32_e32 v0, s1
+; GFX90A-NEXT: v_accvgpr_write_b32 a31, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a30, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a29, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a28, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a27, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a26, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a25, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a24, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a23, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a22, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a21, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a20, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a19, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a18, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a17, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a16, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a15, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a14, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a13, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a12, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a11, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a10, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a9, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a8, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a7, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a6, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a5, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a4, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a3, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0
; GFX90A-NEXT: .LBB5_1: ; %for.cond.preheader
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_nop 1
@@ -1560,41 +1440,42 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dword s1, s[4:5], 0x2c
; GFX942-NEXT: s_mov_b32 s0, 16
+; GFX942-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NEXT: v_accvgpr_write_b32 a31, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a30, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a29, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a28, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a27, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a26, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a25, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a24, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a23, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a22, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a21, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a20, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a19, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a18, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a17, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a16, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a15, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a14, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a13, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a12, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a11, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a10, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a9, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a8, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a7, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a6, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a5, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a4, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a3, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a2, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a1, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a0, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 2.0
-; GFX942-NEXT: v_mov_b32_e32 v1, 1.0
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_accvgpr_write_b32 a0, s1
-; GFX942-NEXT: v_accvgpr_mov_b32 a1, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a3, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a4, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a5, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a6, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a7, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a8, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a9, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a10, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a11, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a12, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a13, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a14, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a15, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a16, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a17, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a18, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a19, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a20, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a21, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a22, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a23, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a24, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a25, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a26, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a27, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a28, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a29, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a30, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a31, a0
; GFX942-NEXT: .LBB5_1: ; %for.cond.preheader
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_nop 1
@@ -1715,72 +1596,60 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa
; GFX908-NEXT: s_cbranch_scc1 .LBB6_1
; GFX908-NEXT: ; %bb.2: ; %exit
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX908-NEXT: v_mov_b32_e32 v4, 0
-; GFX908-NEXT: s_nop 12
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a28
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a29
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a30
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a31
-; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a20
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a21
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a22
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a23
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a16
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a17
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a18
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a19
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a12
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a13
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a15
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a8
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a9
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a10
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a11
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
-; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: s_nop 13
; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v28, a28
+; GFX908-NEXT: v_accvgpr_read_b32 v29, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v30, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v31, a31
+; GFX908-NEXT: v_mov_b32_e32 v32, 0
; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a4
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v7, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v8, a8
+; GFX908-NEXT: v_accvgpr_read_b32 v9, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v10, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v11, a11
+; GFX908-NEXT: v_accvgpr_read_b32 v12, a12
+; GFX908-NEXT: v_accvgpr_read_b32 v13, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v14, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v15, a15
+; GFX908-NEXT: v_accvgpr_read_b32 v16, a16
+; GFX908-NEXT: v_accvgpr_read_b32 v17, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v18, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v19, a19
+; GFX908-NEXT: v_accvgpr_read_b32 v20, a20
+; GFX908-NEXT: v_accvgpr_read_b32 v21, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v22, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v23, a23
+; GFX908-NEXT: v_accvgpr_read_b32 v24, a24
+; GFX908-NEXT: v_accvgpr_read_b32 v25, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v26, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v27, a27
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
+; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
; GFX908-NEXT: s_endpgm
;
; GFX90A-LABEL: test_mfma_loop_mixed_init:
; GFX90A: ; %bb.0: ; %entry
; GFX90A-NEXT: s_load_dword s1, s[4:5], 0x2c
-; GFX90A-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX90A-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0
; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v0, s1
; GFX90A-NEXT: v_accvgpr_write_b32 a29, 0
; GFX90A-NEXT: v_accvgpr_write_b32 a28, 0
; GFX90A-NEXT: v_accvgpr_write_b32 a27, 0
@@ -1810,11 +1679,9 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa
; GFX90A-NEXT: v_accvgpr_write_b32 a3, 0
; GFX90A-NEXT: v_accvgpr_write_b32 a2, 0
; GFX90A-NEXT: s_mov_b32 s0, 16
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0
; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: .LBB6_1: ; %for.cond.preheader
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_nop 1
@@ -1840,9 +1707,12 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa
; GFX942-LABEL: test_mfma_loop_mixed_init:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dword s1, s[4:5], 0x2c
-; GFX942-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a0, v0
; GFX942-NEXT: v_accvgpr_write_b32 a31, 0
; GFX942-NEXT: v_accvgpr_write_b32 a30, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v0, s1
; GFX942-NEXT: v_accvgpr_write_b32 a29, 0
; GFX942-NEXT: v_accvgpr_write_b32 a28, 0
; GFX942-NEXT: v_accvgpr_write_b32 a27, 0
@@ -1872,11 +1742,9 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa
; GFX942-NEXT: v_accvgpr_write_b32 a3, 0
; GFX942-NEXT: v_accvgpr_write_b32 a2, 0
; GFX942-NEXT: s_mov_b32 s0, 16
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-NEXT: v_accvgpr_write_b32 a1, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 2.0
; GFX942-NEXT: v_mov_b32_e32 v1, 1.0
-; GFX942-NEXT: v_accvgpr_write_b32 a0, v2
; GFX942-NEXT: .LBB6_1: ; %for.cond.preheader
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_nop 1
@@ -1967,64 +1835,49 @@ define amdgpu_kernel void @test_mfma_loop_mfma_forward_init(ptr addrspace(1) %ar
; GFX908-NEXT: s_cbranch_scc1 .LBB7_1
; GFX908-NEXT: ; %bb.2: ; %exit
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX908-NEXT: v_mov_b32_e32 v4, 0
-; GFX908-NEXT: s_nop 12
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a28
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a29
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a30
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a31
-; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a20
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a21
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a22
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a23
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a16
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a17
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a18
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a19
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a12
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a13
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a15
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a8
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a9
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a10
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a11
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
-; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: s_nop 13
; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v28, a28
+; GFX908-NEXT: v_accvgpr_read_b32 v29, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v30, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v31, a31
+; GFX908-NEXT: v_mov_b32_e32 v32, 0
; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a4
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v7, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v8, a8
+; GFX908-NEXT: v_accvgpr_read_b32 v9, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v10, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v11, a11
+; GFX908-NEXT: v_accvgpr_read_b32 v12, a12
+; GFX908-NEXT: v_accvgpr_read_b32 v13, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v14, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v15, a15
+; GFX908-NEXT: v_accvgpr_read_b32 v16, a16
+; GFX908-NEXT: v_accvgpr_read_b32 v17, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v18, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v19, a19
+; GFX908-NEXT: v_accvgpr_read_b32 v20, a20
+; GFX908-NEXT: v_accvgpr_read_b32 v21, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v22, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v23, a23
+; GFX908-NEXT: v_accvgpr_read_b32 v24, a24
+; GFX908-NEXT: v_accvgpr_read_b32 v25, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v26, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v27, a27
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
+; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
; GFX908-NEXT: s_endpgm
;
; GFX90A-LABEL: test_mfma_loop_mfma_forward_init:
@@ -2187,64 +2040,49 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 {
; GFX908-NEXT: s_cbranch_scc1 .LBB8_1
; GFX908-NEXT: ; %bb.2: ; %exit
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX908-NEXT: v_mov_b32_e32 v4, 0
-; GFX908-NEXT: s_nop 12
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a28
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a29
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a30
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a31
-; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a20
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a21
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a22
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a23
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a16
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a17
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a18
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a19
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a12
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a13
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a15
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a8
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a9
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a10
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a11
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
-; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: s_nop 13
; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v28, a28
+; GFX908-NEXT: v_accvgpr_read_b32 v29, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v30, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v31, a31
+; GFX908-NEXT: v_mov_b32_e32 v32, 0
; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a4
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v7, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v8, a8
+; GFX908-NEXT: v_accvgpr_read_b32 v9, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v10, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v11, a11
+; GFX908-NEXT: v_accvgpr_read_b32 v12, a12
+; GFX908-NEXT: v_accvgpr_read_b32 v13, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v14, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v15, a15
+; GFX908-NEXT: v_accvgpr_read_b32 v16, a16
+; GFX908-NEXT: v_accvgpr_read_b32 v17, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v18, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v19, a19
+; GFX908-NEXT: v_accvgpr_read_b32 v20, a20
+; GFX908-NEXT: v_accvgpr_read_b32 v21, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v22, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v23, a23
+; GFX908-NEXT: v_accvgpr_read_b32 v24, a24
+; GFX908-NEXT: v_accvgpr_read_b32 v25, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v26, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v27, a27
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
+; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
; GFX908-NEXT: s_endpgm
;
; GFX90A-LABEL: test_mfma_loop_agpr_init:
@@ -2481,64 +2319,49 @@ define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(ptr addrspace(1) %arg)
; GFX908-NEXT: s_cbranch_scc1 .LBB9_1
; GFX908-NEXT: ; %bb.4: ; %exit
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX908-NEXT: v_mov_b32_e32 v4, 0
-; GFX908-NEXT: s_nop 9
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a28
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a29
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a30
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a31
-; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a20
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a21
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a22
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a23
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a16
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a17
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a18
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a19
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a12
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a13
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a15
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a8
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a9
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a10
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a11
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
-; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
-; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
-; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: s_nop 10
; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v28, a28
+; GFX908-NEXT: v_accvgpr_read_b32 v29, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v30, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v31, a31
+; GFX908-NEXT: v_mov_b32_e32 v32, 0
; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a4
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v7, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v8, a8
+; GFX908-NEXT: v_accvgpr_read_b32 v9, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v10, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v11, a11
+; GFX908-NEXT: v_accvgpr_read_b32 v12, a12
+; GFX908-NEXT: v_accvgpr_read_b32 v13, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v14, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v15, a15
+; GFX908-NEXT: v_accvgpr_read_b32 v16, a16
+; GFX908-NEXT: v_accvgpr_read_b32 v17, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v18, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v19, a19
+; GFX908-NEXT: v_accvgpr_read_b32 v20, a20
+; GFX908-NEXT: v_accvgpr_read_b32 v21, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v22, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v23, a23
+; GFX908-NEXT: v_accvgpr_read_b32 v24, a24
+; GFX908-NEXT: v_accvgpr_read_b32 v25, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v26, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v27, a27
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
+; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
; GFX908-NEXT: s_endpgm
;
; GFX90A-LABEL: test_mfma_nested_loop_zeroinit:
@@ -2999,8 +2822,8 @@ define <32 x float> @test_mfma_loop_non_splat_ret_use() #0 {
; GFX908-NEXT: v_accvgpr_write_b32 a3, 0
; GFX908-NEXT: v_accvgpr_write_b32 a2, 0
; GFX908-NEXT: v_accvgpr_write_b32 a0, 0
-; GFX908-NEXT: v_mov_b32_e32 v0, 1.0
; GFX908-NEXT: s_mov_b32 s4, 16
+; GFX908-NEXT: v_mov_b32_e32 v0, 1.0
; GFX908-NEXT: v_mov_b32_e32 v1, 2.0
; GFX908-NEXT: .LBB11_1: ; %for.cond.preheader
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -3048,7 +2871,6 @@ define <32 x float> @test_mfma_loop_non_splat_ret_use() #0 {
; GFX90A-LABEL: test_mfma_loop_non_splat_ret_use:
; GFX90A: ; %bb.0: ; %entry
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0
; GFX90A-NEXT: v_accvgpr_write_b32 a1, 1.0
; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0
; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0
@@ -3082,6 +2904,7 @@ define <32 x float> @test_mfma_loop_non_splat_ret_use() #0 {
; GFX90A-NEXT: v_accvgpr_write_b32 a2, 0
; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0
; GFX90A-NEXT: s_mov_b32 s4, 16
+; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0
; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0
; GFX90A-NEXT: .LBB11_1: ; %for.cond.preheader
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -3129,7 +2952,6 @@ define <32 x float> @test_mfma_loop_non_splat_ret_use() #0 {
; GFX942-LABEL: test_mfma_loop_non_splat_ret_use:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v0, 1.0
; GFX942-NEXT: v_accvgpr_write_b32 a1, 1.0
; GFX942-NEXT: v_accvgpr_write_b32 a31, 0
; GFX942-NEXT: v_accvgpr_write_b32 a30, 0
@@ -3163,6 +2985,7 @@ define <32 x float> @test_mfma_loop_non_splat_ret_use() #0 {
; GFX942-NEXT: v_accvgpr_write_b32 a2, 0
; GFX942-NEXT: v_accvgpr_write_b32 a0, 0
; GFX942-NEXT: s_mov_b32 s0, 16
+; GFX942-NEXT: v_mov_b32_e32 v0, 1.0
; GFX942-NEXT: v_mov_b32_e32 v1, 2.0
; GFX942-NEXT: .LBB11_1: ; %for.cond.preheader
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
diff --git a/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll b/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll
index 800eb9efa571e..51cd564bdece3 100644
--- a/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll
+++ b/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll
@@ -95,123 +95,123 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 {
; GREEDY908-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v3, v0, a[0:31]
; GREEDY908-NEXT: s_nop 15
; GREEDY908-NEXT: s_nop 1
-; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a32
-; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a33
+; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a32
+; GREEDY908-NEXT: v_accvgpr_read_b32 v5, a61
+; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a60
+; GREEDY908-NEXT: v_accvgpr_write_b32 a2, v1
+; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a33
+; GREEDY908-NEXT: v_accvgpr_read_b32 v7, a59
+; GREEDY908-NEXT: v_accvgpr_read_b32 v8, a58
+; GREEDY908-NEXT: v_accvgpr_write_b32 a3, v1
; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a34
-; GREEDY908-NEXT: v_accvgpr_write_b32 a2, v2
-; GREEDY908-NEXT: v_accvgpr_write_b32 a3, v6
+; GREEDY908-NEXT: v_accvgpr_read_b32 v9, a57
+; GREEDY908-NEXT: v_accvgpr_read_b32 v10, a56
; GREEDY908-NEXT: v_accvgpr_write_b32 a4, v1
-; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a35
-; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a36
+; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a35
+; GREEDY908-NEXT: v_accvgpr_read_b32 v11, a55
+; GREEDY908-NEXT: v_accvgpr_read_b32 v12, a54
+; GREEDY908-NEXT: v_accvgpr_write_b32 a5, v1
+; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a36
+; GREEDY908-NEXT: v_accvgpr_read_b32 v13, a53
+; GREEDY908-NEXT: v_accvgpr_read_b32 v14, a52
+; GREEDY908-NEXT: v_accvgpr_write_b32 a6, v1
; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a37
-; GREEDY908-NEXT: v_accvgpr_write_b32 a5, v2
-; GREEDY908-NEXT: v_accvgpr_write_b32 a6, v6
+; GREEDY908-NEXT: v_accvgpr_read_b32 v15, a51
+; GREEDY908-NEXT: v_accvgpr_read_b32 v16, a50
; GREEDY908-NEXT: v_accvgpr_write_b32 a7, v1
-; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a38
-; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a39
+; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a38
+; GREEDY908-NEXT: v_accvgpr_read_b32 v17, a49
+; GREEDY908-NEXT: v_accvgpr_read_b32 v18, a48
+; GREEDY908-NEXT: v_accvgpr_write_b32 a8, v1
+; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a39
+; GREEDY908-NEXT: v_accvgpr_read_b32 v19, a47
+; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a46
+; GREEDY908-NEXT: v_accvgpr_write_b32 a9, v1
; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a40
-; GREEDY908-NEXT: v_accvgpr_write_b32 a8, v2
-; GREEDY908-NEXT: v_accvgpr_write_b32 a9, v6
+; GREEDY908-NEXT: v_accvgpr_write_b32 a16, v2
+; GREEDY908-NEXT: v_accvgpr_write_b32 a17, v19
; GREEDY908-NEXT: v_accvgpr_write_b32 a10, v1
-; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a41
-; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a42
+; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a41
+; GREEDY908-NEXT: v_accvgpr_write_b32 a18, v18
+; GREEDY908-NEXT: v_accvgpr_write_b32 a19, v17
+; GREEDY908-NEXT: v_accvgpr_write_b32 a11, v1
+; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a42
+; GREEDY908-NEXT: v_accvgpr_write_b32 a20, v16
+; GREEDY908-NEXT: v_accvgpr_write_b32 a21, v15
+; GREEDY908-NEXT: v_accvgpr_write_b32 a12, v1
; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a43
-; GREEDY908-NEXT: v_accvgpr_write_b32 a11, v2
-; GREEDY908-NEXT: v_accvgpr_write_b32 a12, v6
+; GREEDY908-NEXT: v_accvgpr_write_b32 a22, v14
+; GREEDY908-NEXT: v_accvgpr_write_b32 a23, v13
; GREEDY908-NEXT: v_accvgpr_write_b32 a13, v1
-; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a44
-; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a45
-; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a46
-; GREEDY908-NEXT: v_accvgpr_write_b32 a14, v2
-; GREEDY908-NEXT: v_accvgpr_write_b32 a15, v6
-; GREEDY908-NEXT: v_accvgpr_write_b32 a16, v1
-; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a47
-; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a48
-; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a49
-; GREEDY908-NEXT: v_accvgpr_write_b32 a17, v2
-; GREEDY908-NEXT: v_accvgpr_write_b32 a18, v6
-; GREEDY908-NEXT: v_accvgpr_write_b32 a19, v1
-; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a50
-; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a51
-; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a52
-; GREEDY908-NEXT: v_accvgpr_write_b32 a20, v2
-; GREEDY908-NEXT: v_accvgpr_write_b32 a21, v6
-; GREEDY908-NEXT: v_accvgpr_write_b32 a22, v1
-; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a53
-; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a54
-; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a55
-; GREEDY908-NEXT: v_accvgpr_write_b32 a23, v2
-; GREEDY908-NEXT: v_accvgpr_write_b32 a24, v6
-; GREEDY908-NEXT: v_accvgpr_write_b32 a25, v1
-; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a56
-; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a57
-; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a58
-; GREEDY908-NEXT: v_accvgpr_write_b32 a26, v2
-; GREEDY908-NEXT: v_accvgpr_write_b32 a27, v6
-; GREEDY908-NEXT: v_accvgpr_write_b32 a28, v1
-; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a59
-; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a60
-; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a61
-; GREEDY908-NEXT: v_accvgpr_write_b32 a29, v2
+; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a44
+; GREEDY908-NEXT: v_accvgpr_write_b32 a24, v12
+; GREEDY908-NEXT: v_accvgpr_write_b32 a25, v11
+; GREEDY908-NEXT: v_accvgpr_write_b32 a14, v1
+; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a45
+; GREEDY908-NEXT: v_accvgpr_write_b32 a26, v10
+; GREEDY908-NEXT: v_accvgpr_write_b32 a27, v9
+; GREEDY908-NEXT: v_accvgpr_write_b32 a15, v1
+; GREEDY908-NEXT: v_accvgpr_write_b32 a28, v8
+; GREEDY908-NEXT: v_accvgpr_write_b32 a29, v7
; GREEDY908-NEXT: v_accvgpr_write_b32 a30, v6
-; GREEDY908-NEXT: v_accvgpr_write_b32 a31, v1
+; GREEDY908-NEXT: v_accvgpr_write_b32 a31, v5
; GREEDY908-NEXT: s_nop 0
; GREEDY908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31]
; GREEDY908-NEXT: s_nop 15
; GREEDY908-NEXT: s_nop 1
-; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a24
-; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a25
-; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a26
; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a27
+; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a26
+; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a25
+; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a24
; GREEDY908-NEXT: s_nop 1
; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96
; GREEDY908-NEXT: s_nop 0
-; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a28
-; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a29
-; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a30
; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a31
+; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a30
+; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a29
+; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a28
; GREEDY908-NEXT: s_nop 1
; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:112
; GREEDY908-NEXT: s_nop 0
-; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a16
-; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a17
-; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a18
; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a19
+; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a18
+; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a17
+; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a16
; GREEDY908-NEXT: s_nop 1
; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64
; GREEDY908-NEXT: s_nop 0
-; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a20
-; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a21
-; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a22
; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a23
+; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a22
+; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a21
+; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a20
; GREEDY908-NEXT: s_nop 1
; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80
; GREEDY908-NEXT: s_nop 0
-; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a8
-; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a9
-; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a10
; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a11
+; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a10
+; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a9
+; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a8
; GREEDY908-NEXT: s_nop 1
; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32
; GREEDY908-NEXT: s_nop 0
-; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a12
-; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a13
-; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a14
; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a15
+; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a14
+; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a13
+; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a12
; GREEDY908-NEXT: s_nop 1
; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48
; GREEDY908-NEXT: s_nop 0
-; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a0
-; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a1
-; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a2
; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a3
+; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a2
+; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a1
+; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a0
; GREEDY908-NEXT: s_nop 1
; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35]
; GREEDY908-NEXT: s_nop 0
-; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a4
-; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a5
-; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a6
; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a7
+; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a6
+; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a5
+; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a4
; GREEDY908-NEXT: s_nop 1
; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16
; GREEDY908-NEXT: s_endpgm
@@ -499,73 +499,105 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 {
; FAST90A-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0x0
; FAST90A-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x40
; FAST90A-NEXT: s_waitcnt lgkmcnt(0)
-; FAST90A-NEXT: v_accvgpr_write_b32 a0, s36
-; FAST90A-NEXT: v_accvgpr_write_b32 a1, s37
-; FAST90A-NEXT: v_accvgpr_write_b32 a2, s38
-; FAST90A-NEXT: v_accvgpr_write_b32 a3, s39
-; FAST90A-NEXT: v_accvgpr_write_b32 a4, s40
-; FAST90A-NEXT: v_accvgpr_write_b32 a5, s41
-; FAST90A-NEXT: v_accvgpr_write_b32 a6, s42
-; FAST90A-NEXT: v_accvgpr_write_b32 a7, s43
-; FAST90A-NEXT: v_accvgpr_write_b32 a8, s44
-; FAST90A-NEXT: v_accvgpr_write_b32 a9, s45
-; FAST90A-NEXT: v_accvgpr_write_b32 a10, s46
-; FAST90A-NEXT: v_accvgpr_write_b32 a11, s47
-; FAST90A-NEXT: v_accvgpr_write_b32 a12, s48
-; FAST90A-NEXT: v_accvgpr_write_b32 a13, s49
-; FAST90A-NEXT: v_accvgpr_write_b32 a14, s50
-; FAST90A-NEXT: v_accvgpr_write_b32 a15, s51
-; FAST90A-NEXT: v_accvgpr_write_b32 a16, s4
-; FAST90A-NEXT: v_accvgpr_write_b32 a17, s5
-; FAST90A-NEXT: v_accvgpr_write_b32 a18, s6
-; FAST90A-NEXT: v_accvgpr_write_b32 a19, s7
-; FAST90A-NEXT: v_accvgpr_write_b32 a20, s8
-; FAST90A-NEXT: v_accvgpr_write_b32 a21, s9
-; FAST90A-NEXT: v_accvgpr_write_b32 a22, s10
-; FAST90A-NEXT: v_accvgpr_write_b32 a23, s11
-; FAST90A-NEXT: v_accvgpr_write_b32 a24, s12
-; FAST90A-NEXT: v_accvgpr_write_b32 a25, s13
-; FAST90A-NEXT: v_accvgpr_write_b32 a26, s14
-; FAST90A-NEXT: v_accvgpr_write_b32 a27, s15
-; FAST90A-NEXT: v_accvgpr_write_b32 a28, s16
-; FAST90A-NEXT: v_accvgpr_write_b32 a29, s17
-; FAST90A-NEXT: v_accvgpr_write_b32 a30, s18
-; FAST90A-NEXT: v_accvgpr_write_b32 a31, s19
+; FAST90A-NEXT: v_accvgpr_write_b32 a32, s36
+; FAST90A-NEXT: v_accvgpr_write_b32 a33, s37
+; FAST90A-NEXT: v_accvgpr_write_b32 a34, s38
+; FAST90A-NEXT: v_accvgpr_write_b32 a35, s39
+; FAST90A-NEXT: v_accvgpr_write_b32 a36, s40
+; FAST90A-NEXT: v_accvgpr_write_b32 a37, s41
+; FAST90A-NEXT: v_accvgpr_write_b32 a38, s42
+; FAST90A-NEXT: v_accvgpr_write_b32 a39, s43
+; FAST90A-NEXT: v_accvgpr_write_b32 a40, s44
+; FAST90A-NEXT: v_accvgpr_write_b32 a41, s45
+; FAST90A-NEXT: v_accvgpr_write_b32 a42, s46
+; FAST90A-NEXT: v_accvgpr_write_b32 a43, s47
+; FAST90A-NEXT: v_accvgpr_write_b32 a44, s48
+; FAST90A-NEXT: v_accvgpr_write_b32 a45, s49
+; FAST90A-NEXT: v_accvgpr_write_b32 a46, s50
+; FAST90A-NEXT: v_accvgpr_write_b32 a47, s51
+; FAST90A-NEXT: v_accvgpr_write_b32 a48, s4
+; FAST90A-NEXT: v_accvgpr_write_b32 a49, s5
+; FAST90A-NEXT: v_accvgpr_write_b32 a50, s6
+; FAST90A-NEXT: v_accvgpr_write_b32 a51, s7
+; FAST90A-NEXT: v_accvgpr_write_b32 a52, s8
+; FAST90A-NEXT: v_accvgpr_write_b32 a53, s9
+; FAST90A-NEXT: v_accvgpr_write_b32 a54, s10
+; FAST90A-NEXT: v_accvgpr_write_b32 a55, s11
+; FAST90A-NEXT: v_accvgpr_write_b32 a56, s12
+; FAST90A-NEXT: v_accvgpr_write_b32 a57, s13
+; FAST90A-NEXT: v_accvgpr_write_b32 a58, s14
+; FAST90A-NEXT: v_accvgpr_write_b32 a59, s15
+; FAST90A-NEXT: v_accvgpr_write_b32 a60, s16
+; FAST90A-NEXT: v_accvgpr_write_b32 a61, s17
+; FAST90A-NEXT: v_accvgpr_write_b32 a62, s18
+; FAST90A-NEXT: v_accvgpr_write_b32 a63, s19
; FAST90A-NEXT: s_nop 1
-; FAST90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
-; FAST90A-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v1, v2, a[0:31]
+; FAST90A-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v1, v2, a[32:63]
+; FAST90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[32:63]
; FAST90A-NEXT: s_nop 15
; FAST90A-NEXT: s_nop 2
-; FAST90A-NEXT: v_accvgpr_mov_b32 a2, a32
-; FAST90A-NEXT: v_accvgpr_mov_b32 a3, a33
-; FAST90A-NEXT: v_accvgpr_mov_b32 a4, a34
-; FAST90A-NEXT: v_accvgpr_mov_b32 a5, a35
-; FAST90A-NEXT: v_accvgpr_mov_b32 a6, a36
-; FAST90A-NEXT: v_accvgpr_mov_b32 a7, a37
-; FAST90A-NEXT: v_accvgpr_mov_b32 a8, a38
-; FAST90A-NEXT: v_accvgpr_mov_b32 a9, a39
-; FAST90A-NEXT: v_accvgpr_mov_b32 a10, a40
-; FAST90A-NEXT: v_accvgpr_mov_b32 a11, a41
-; FAST90A-NEXT: v_accvgpr_mov_b32 a12, a42
-; FAST90A-NEXT: v_accvgpr_mov_b32 a13, a43
-; FAST90A-NEXT: v_accvgpr_mov_b32 a14, a44
-; FAST90A-NEXT: v_accvgpr_mov_b32 a15, a45
-; FAST90A-NEXT: v_accvgpr_mov_b32 a16, a46
-; FAST90A-NEXT: v_accvgpr_mov_b32 a17, a47
-; FAST90A-NEXT: v_accvgpr_mov_b32 a18, a48
-; FAST90A-NEXT: v_accvgpr_mov_b32 a19, a49
-; FAST90A-NEXT: v_accvgpr_mov_b32 a20, a50
-; FAST90A-NEXT: v_accvgpr_mov_b32 a21, a51
-; FAST90A-NEXT: v_accvgpr_mov_b32 a22, a52
-; FAST90A-NEXT: v_accvgpr_mov_b32 a23, a53
-; FAST90A-NEXT: v_accvgpr_mov_b32 a24, a54
-; FAST90A-NEXT: v_accvgpr_mov_b32 a25, a55
-; FAST90A-NEXT: v_accvgpr_mov_b32 a26, a56
-; FAST90A-NEXT: v_accvgpr_mov_b32 a27, a57
-; FAST90A-NEXT: v_accvgpr_mov_b32 a28, a58
-; FAST90A-NEXT: v_accvgpr_mov_b32 a29, a59
-; FAST90A-NEXT: v_accvgpr_mov_b32 a30, a60
-; FAST90A-NEXT: v_accvgpr_mov_b32 a31, a61
+; FAST90A-NEXT: v_accvgpr_read_b32 v3, a29
+; FAST90A-NEXT: v_accvgpr_read_b32 v4, a28
+; FAST90A-NEXT: v_accvgpr_read_b32 v5, a27
+; FAST90A-NEXT: v_accvgpr_read_b32 v6, a26
+; FAST90A-NEXT: v_accvgpr_read_b32 v7, a25
+; FAST90A-NEXT: v_accvgpr_read_b32 v8, a24
+; FAST90A-NEXT: v_accvgpr_read_b32 v9, a23
+; FAST90A-NEXT: v_accvgpr_read_b32 v10, a22
+; FAST90A-NEXT: v_accvgpr_read_b32 v11, a21
+; FAST90A-NEXT: v_accvgpr_read_b32 v12, a20
+; FAST90A-NEXT: v_accvgpr_read_b32 v13, a19
+; FAST90A-NEXT: v_accvgpr_read_b32 v14, a18
+; FAST90A-NEXT: v_accvgpr_read_b32 v15, a17
+; FAST90A-NEXT: v_accvgpr_read_b32 v16, a16
+; FAST90A-NEXT: v_accvgpr_read_b32 v17, a15
+; FAST90A-NEXT: v_accvgpr_read_b32 v18, a14
+; FAST90A-NEXT: v_accvgpr_read_b32 v19, a13
+; FAST90A-NEXT: v_accvgpr_read_b32 v20, a12
+; FAST90A-NEXT: v_accvgpr_read_b32 v21, a11
+; FAST90A-NEXT: v_accvgpr_read_b32 v22, a10
+; FAST90A-NEXT: v_accvgpr_read_b32 v23, a9
+; FAST90A-NEXT: v_accvgpr_read_b32 v24, a8
+; FAST90A-NEXT: v_accvgpr_read_b32 v25, a7
+; FAST90A-NEXT: v_accvgpr_read_b32 v26, a6
+; FAST90A-NEXT: v_accvgpr_read_b32 v27, a5
+; FAST90A-NEXT: v_accvgpr_read_b32 v28, a4
+; FAST90A-NEXT: v_accvgpr_read_b32 v29, a3
+; FAST90A-NEXT: v_accvgpr_read_b32 v30, a2
+; FAST90A-NEXT: v_accvgpr_read_b32 v31, a1
+; FAST90A-NEXT: v_accvgpr_read_b32 v32, a0
+; FAST90A-NEXT: v_accvgpr_mov_b32 a0, a32
+; FAST90A-NEXT: v_accvgpr_mov_b32 a1, a33
+; FAST90A-NEXT: v_accvgpr_write_b32 a2, v32
+; FAST90A-NEXT: v_accvgpr_write_b32 a3, v31
+; FAST90A-NEXT: v_accvgpr_write_b32 a4, v30
+; FAST90A-NEXT: v_accvgpr_write_b32 a5, v29
+; FAST90A-NEXT: v_accvgpr_write_b32 a6, v28
+; FAST90A-NEXT: v_accvgpr_write_b32 a7, v27
+; FAST90A-NEXT: v_accvgpr_write_b32 a8, v26
+; FAST90A-NEXT: v_accvgpr_write_b32 a9, v25
+; FAST90A-NEXT: v_accvgpr_write_b32 a10, v24
+; FAST90A-NEXT: v_accvgpr_write_b32 a11, v23
+; FAST90A-NEXT: v_accvgpr_write_b32 a12, v22
+; FAST90A-NEXT: v_accvgpr_write_b32 a13, v21
+; FAST90A-NEXT: v_accvgpr_write_b32 a14, v20
+; FAST90A-NEXT: v_accvgpr_write_b32 a15, v19
+; FAST90A-NEXT: v_accvgpr_write_b32 a16, v18
+; FAST90A-NEXT: v_accvgpr_write_b32 a17, v17
+; FAST90A-NEXT: v_accvgpr_write_b32 a18, v16
+; FAST90A-NEXT: v_accvgpr_write_b32 a19, v15
+; FAST90A-NEXT: v_accvgpr_write_b32 a20, v14
+; FAST90A-NEXT: v_accvgpr_write_b32 a21, v13
+; FAST90A-NEXT: v_accvgpr_write_b32 a22, v12
+; FAST90A-NEXT: v_accvgpr_write_b32 a23, v11
+; FAST90A-NEXT: v_accvgpr_write_b32 a24, v10
+; FAST90A-NEXT: v_accvgpr_write_b32 a25, v9
+; FAST90A-NEXT: v_accvgpr_write_b32 a26, v8
+; FAST90A-NEXT: v_accvgpr_write_b32 a27, v7
+; FAST90A-NEXT: v_accvgpr_write_b32 a28, v6
+; FAST90A-NEXT: v_accvgpr_write_b32 a29, v5
+; FAST90A-NEXT: v_accvgpr_write_b32 a30, v4
+; FAST90A-NEXT: v_accvgpr_write_b32 a31, v3
; FAST90A-NEXT: s_nop 1
; FAST90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
; FAST90A-NEXT: s_nop 15
@@ -594,98 +626,82 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
; GREEDY908: ; %bb.0: ; %bb
; GREEDY908-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
; GREEDY908-NEXT: v_mov_b32_e32 v0, 1.0
-; GREEDY908-NEXT: v_mov_b32_e32 v16, 0
+; GREEDY908-NEXT: v_mov_b32_e32 v4, 0
; GREEDY908-NEXT: s_waitcnt lgkmcnt(0)
; GREEDY908-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
; GREEDY908-NEXT: s_waitcnt lgkmcnt(0)
-; GREEDY908-NEXT: v_mov_b32_e32 v17, s0
-; GREEDY908-NEXT: v_mov_b32_e32 v1, s1
-; GREEDY908-NEXT: v_mov_b32_e32 v2, s2
-; GREEDY908-NEXT: v_accvgpr_write_b32 a0, v17
-; GREEDY908-NEXT: v_mov_b32_e32 v17, s3
-; GREEDY908-NEXT: v_accvgpr_write_b32 a1, v1
-; GREEDY908-NEXT: v_accvgpr_write_b32 a2, v2
-; GREEDY908-NEXT: v_accvgpr_write_b32 a3, v17
-; GREEDY908-NEXT: v_mov_b32_e32 v1, s4
-; GREEDY908-NEXT: v_mov_b32_e32 v2, s5
-; GREEDY908-NEXT: v_mov_b32_e32 v17, s6
-; GREEDY908-NEXT: v_accvgpr_write_b32 a4, v1
-; GREEDY908-NEXT: v_accvgpr_write_b32 a5, v2
-; GREEDY908-NEXT: v_accvgpr_write_b32 a6, v17
-; GREEDY908-NEXT: v_mov_b32_e32 v1, s7
-; GREEDY908-NEXT: v_mov_b32_e32 v2, s8
-; GREEDY908-NEXT: v_mov_b32_e32 v17, s9
-; GREEDY908-NEXT: v_accvgpr_write_b32 a7, v1
-; GREEDY908-NEXT: v_accvgpr_write_b32 a8, v2
-; GREEDY908-NEXT: v_accvgpr_write_b32 a9, v17
-; GREEDY908-NEXT: v_mov_b32_e32 v1, s10
-; GREEDY908-NEXT: v_mov_b32_e32 v2, s11
-; GREEDY908-NEXT: v_mov_b32_e32 v17, s12
-; GREEDY908-NEXT: v_accvgpr_write_b32 a10, v1
-; GREEDY908-NEXT: v_accvgpr_write_b32 a11, v2
-; GREEDY908-NEXT: v_accvgpr_write_b32 a12, v17
-; GREEDY908-NEXT: v_mov_b32_e32 v1, s13
+; GREEDY908-NEXT: v_mov_b32_e32 v5, s15
; GREEDY908-NEXT: v_mov_b32_e32 v2, s14
-; GREEDY908-NEXT: v_mov_b32_e32 v17, s15
-; GREEDY908-NEXT: v_accvgpr_write_b32 a13, v1
-; GREEDY908-NEXT: v_accvgpr_write_b32 a14, v2
-; GREEDY908-NEXT: v_accvgpr_write_b32 a15, v17
+; GREEDY908-NEXT: v_mov_b32_e32 v1, s13
+; GREEDY908-NEXT: v_accvgpr_write_b32 a33, v5
+; GREEDY908-NEXT: v_mov_b32_e32 v5, s12
+; GREEDY908-NEXT: v_accvgpr_write_b32 a32, v2
+; GREEDY908-NEXT: v_accvgpr_write_b32 a31, v1
+; GREEDY908-NEXT: v_accvgpr_write_b32 a30, v5
+; GREEDY908-NEXT: v_mov_b32_e32 v2, s11
+; GREEDY908-NEXT: v_mov_b32_e32 v1, s10
+; GREEDY908-NEXT: v_mov_b32_e32 v5, s9
+; GREEDY908-NEXT: v_accvgpr_write_b32 a29, v2
+; GREEDY908-NEXT: v_accvgpr_write_b32 a28, v1
+; GREEDY908-NEXT: v_accvgpr_write_b32 a27, v5
+; GREEDY908-NEXT: v_mov_b32_e32 v2, s8
+; GREEDY908-NEXT: v_mov_b32_e32 v1, s7
+; GREEDY908-NEXT: v_mov_b32_e32 v5, s6
+; GREEDY908-NEXT: v_accvgpr_write_b32 a26, v2
+; GREEDY908-NEXT: v_accvgpr_write_b32 a25, v1
+; GREEDY908-NEXT: v_accvgpr_write_b32 a24, v5
+; GREEDY908-NEXT: v_mov_b32_e32 v2, s5
+; GREEDY908-NEXT: v_mov_b32_e32 v1, s4
+; GREEDY908-NEXT: v_mov_b32_e32 v5, s3
+; GREEDY908-NEXT: v_accvgpr_write_b32 a23, v2
+; GREEDY908-NEXT: v_accvgpr_write_b32 a22, v1
+; GREEDY908-NEXT: v_accvgpr_write_b32 a21, v5
+; GREEDY908-NEXT: v_mov_b32_e32 v2, s2
+; GREEDY908-NEXT: v_mov_b32_e32 v1, s1
+; GREEDY908-NEXT: v_mov_b32_e32 v5, s0
+; GREEDY908-NEXT: v_accvgpr_write_b32 a20, v2
+; GREEDY908-NEXT: v_accvgpr_write_b32 a19, v1
+; GREEDY908-NEXT: v_accvgpr_write_b32 a18, v5
; GREEDY908-NEXT: v_mov_b32_e32 v1, 2.0
; GREEDY908-NEXT: s_nop 1
-; GREEDY908-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
-; GREEDY908-NEXT: v_mfma_f32_16x16x1f32 a[16:31], v0, v1, a[0:15]
-; GREEDY908-NEXT: s_nop 9
-; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a16
-; GREEDY908-NEXT: v_accvgpr_read_b32 v17, a17
-; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a18
-; GREEDY908-NEXT: v_accvgpr_write_b32 a2, v3
-; GREEDY908-NEXT: v_accvgpr_write_b32 a3, v17
-; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a19
-; GREEDY908-NEXT: v_accvgpr_read_b32 v17, a20
-; GREEDY908-NEXT: v_accvgpr_write_b32 a4, v2
-; GREEDY908-NEXT: v_accvgpr_write_b32 a5, v3
-; GREEDY908-NEXT: v_accvgpr_write_b32 a6, v17
-; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a21
-; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a22
-; GREEDY908-NEXT: v_accvgpr_read_b32 v17, a23
-; GREEDY908-NEXT: v_accvgpr_write_b32 a7, v2
-; GREEDY908-NEXT: v_accvgpr_write_b32 a8, v3
-; GREEDY908-NEXT: v_accvgpr_write_b32 a9, v17
-; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a24
-; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a25
-; GREEDY908-NEXT: v_accvgpr_read_b32 v17, a26
-; GREEDY908-NEXT: v_accvgpr_write_b32 a10, v2
-; GREEDY908-NEXT: v_accvgpr_write_b32 a11, v3
-; GREEDY908-NEXT: v_accvgpr_write_b32 a12, v17
-; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a27
-; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a28
-; GREEDY908-NEXT: v_accvgpr_read_b32 v17, a29
-; GREEDY908-NEXT: v_accvgpr_write_b32 a13, v2
-; GREEDY908-NEXT: v_accvgpr_write_b32 a14, v3
-; GREEDY908-NEXT: v_accvgpr_write_b32 a15, v17
+; GREEDY908-NEXT: v_mfma_f32_16x16x1f32 a[18:33], v0, v1, a[18:33]
+; GREEDY908-NEXT: v_mfma_f32_16x16x1f32 a[2:17], v0, v1, a[18:33]
+; GREEDY908-NEXT: s_nop 8
+; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a19
+; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a18
+; GREEDY908-NEXT: s_nop 0
+; GREEDY908-NEXT: v_accvgpr_write_b32 a1, v2
+; GREEDY908-NEXT: v_accvgpr_write_b32 a0, v3
; GREEDY908-NEXT: s_nop 0
; GREEDY908-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
; GREEDY908-NEXT: s_nop 9
-; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a12
-; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a13
-; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a14
; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a15
-; GREEDY908-NEXT: v_accvgpr_read_b32 v12, a0
-; GREEDY908-NEXT: v_accvgpr_read_b32 v13, a1
-; GREEDY908-NEXT: v_accvgpr_read_b32 v14, a2
-; GREEDY908-NEXT: v_accvgpr_read_b32 v15, a3
-; GREEDY908-NEXT: v_accvgpr_read_b32 v4, a8
-; GREEDY908-NEXT: v_accvgpr_read_b32 v5, a9
-; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a10
-; GREEDY908-NEXT: v_accvgpr_read_b32 v7, a11
-; GREEDY908-NEXT: v_accvgpr_read_b32 v8, a4
-; GREEDY908-NEXT: v_accvgpr_read_b32 v9, a5
-; GREEDY908-NEXT: v_accvgpr_read_b32 v10, a6
-; GREEDY908-NEXT: v_accvgpr_read_b32 v11, a7
-; GREEDY908-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:48
-; GREEDY908-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:32
-; GREEDY908-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16
-; GREEDY908-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17]
+; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a14
+; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a13
+; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a12
+; GREEDY908-NEXT: s_nop 1
+; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:48
+; GREEDY908-NEXT: s_nop 0
+; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a11
+; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a10
+; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a9
+; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a8
+; GREEDY908-NEXT: s_nop 1
+; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:32
+; GREEDY908-NEXT: s_nop 0
+; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a7
+; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a6
+; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a5
+; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a4
+; GREEDY908-NEXT: s_nop 1
+; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16
+; GREEDY908-NEXT: s_nop 0
+; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a3
+; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a2
+; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a1
+; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a0
+; GREEDY908-NEXT: s_nop 1
+; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GREEDY908-NEXT: s_endpgm
;
; GREEDY90A-LABEL: test_mfma_f32_16x16x1f32:
@@ -693,51 +709,39 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
; GREEDY90A-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
; GREEDY90A-NEXT: v_mov_b32_e32 v0, 1.0
; GREEDY90A-NEXT: v_mov_b32_e32 v1, 2.0
+; GREEDY90A-NEXT: v_mov_b32_e32 v2, 0
; GREEDY90A-NEXT: s_waitcnt lgkmcnt(0)
; GREEDY90A-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
; GREEDY90A-NEXT: s_waitcnt lgkmcnt(0)
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a0, s0
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a1, s1
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a2, s2
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a3, s3
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a4, s4
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a5, s5
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a6, s6
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a7, s7
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a8, s8
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a9, s9
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a10, s10
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a11, s11
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a12, s12
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a13, s13
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a14, s14
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a15, s15
+; GREEDY90A-NEXT: v_accvgpr_write_b32 a33, s15
+; GREEDY90A-NEXT: v_accvgpr_write_b32 a32, s14
+; GREEDY90A-NEXT: v_accvgpr_write_b32 a31, s13
+; GREEDY90A-NEXT: v_accvgpr_write_b32 a30, s12
+; GREEDY90A-NEXT: v_accvgpr_write_b32 a29, s11
+; GREEDY90A-NEXT: v_accvgpr_write_b32 a28, s10
+; GREEDY90A-NEXT: v_accvgpr_write_b32 a27, s9
+; GREEDY90A-NEXT: v_accvgpr_write_b32 a26, s8
+; GREEDY90A-NEXT: v_accvgpr_write_b32 a25, s7
+; GREEDY90A-NEXT: v_accvgpr_write_b32 a24, s6
+; GREEDY90A-NEXT: v_accvgpr_write_b32 a23, s5
+; GREEDY90A-NEXT: v_accvgpr_write_b32 a22, s4
+; GREEDY90A-NEXT: v_accvgpr_write_b32 a21, s3
+; GREEDY90A-NEXT: v_accvgpr_write_b32 a20, s2
+; GREEDY90A-NEXT: v_accvgpr_write_b32 a19, s1
+; GREEDY90A-NEXT: v_accvgpr_write_b32 a18, s0
; GREEDY90A-NEXT: s_nop 1
-; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
-; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[16:31], v0, v1, a[0:15]
-; GREEDY90A-NEXT: s_nop 10
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a2, a16
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a3, a17
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a4, a18
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a5, a19
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a6, a20
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a7, a21
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a8, a22
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a9, a23
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a10, a24
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a11, a25
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a12, a26
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a13, a27
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a14, a28
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a15, a29
+; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[18:33], v0, v1, a[18:33]
+; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[2:17], v0, v1, a[18:33]
+; GREEDY90A-NEXT: s_nop 9
+; GREEDY90A-NEXT: v_accvgpr_mov_b32 a0, a18
+; GREEDY90A-NEXT: v_accvgpr_mov_b32 a1, a19
; GREEDY90A-NEXT: s_nop 1
; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
-; GREEDY90A-NEXT: v_mov_b32_e32 v0, 0
-; GREEDY90A-NEXT: s_nop 9
-; GREEDY90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
-; GREEDY90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
-; GREEDY90A-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
-; GREEDY90A-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GREEDY90A-NEXT: s_nop 10
+; GREEDY90A-NEXT: global_store_dwordx4 v2, a[12:15], s[16:17] offset:48
+; GREEDY90A-NEXT: global_store_dwordx4 v2, a[8:11], s[16:17] offset:32
+; GREEDY90A-NEXT: global_store_dwordx4 v2, a[4:7], s[16:17] offset:16
+; GREEDY90A-NEXT: global_store_dwordx4 v2, a[0:3], s[16:17]
; GREEDY90A-NEXT: s_endpgm
;
; GREEDY942-LABEL: test_mfma_f32_16x16x1f32:
@@ -745,51 +749,39 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
; GREEDY942-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
; GREEDY942-NEXT: v_mov_b32_e32 v0, 1.0
; GREEDY942-NEXT: v_mov_b32_e32 v1, 2.0
+; GREEDY942-NEXT: v_mov_b32_e32 v2, 0
; GREEDY942-NEXT: s_waitcnt lgkmcnt(0)
; GREEDY942-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
; GREEDY942-NEXT: s_waitcnt lgkmcnt(0)
-; GREEDY942-NEXT: v_accvgpr_write_b32 a0, s0
-; GREEDY942-NEXT: v_accvgpr_write_b32 a1, s1
-; GREEDY942-NEXT: v_accvgpr_write_b32 a2, s2
-; GREEDY942-NEXT: v_accvgpr_write_b32 a3, s3
-; GREEDY942-NEXT: v_accvgpr_write_b32 a4, s4
-; GREEDY942-NEXT: v_accvgpr_write_b32 a5, s5
-; GREEDY942-NEXT: v_accvgpr_write_b32 a6, s6
-; GREEDY942-NEXT: v_accvgpr_write_b32 a7, s7
-; GREEDY942-NEXT: v_accvgpr_write_b32 a8, s8
-; GREEDY942-NEXT: v_accvgpr_write_b32 a9, s9
-; GREEDY942-NEXT: v_accvgpr_write_b32 a10, s10
-; GREEDY942-NEXT: v_accvgpr_write_b32 a11, s11
-; GREEDY942-NEXT: v_accvgpr_write_b32 a12, s12
-; GREEDY942-NEXT: v_accvgpr_write_b32 a13, s13
-; GREEDY942-NEXT: v_accvgpr_write_b32 a14, s14
-; GREEDY942-NEXT: v_accvgpr_write_b32 a15, s15
+; GREEDY942-NEXT: v_accvgpr_write_b32 a33, s15
+; GREEDY942-NEXT: v_accvgpr_write_b32 a32, s14
+; GREEDY942-NEXT: v_accvgpr_write_b32 a31, s13
+; GREEDY942-NEXT: v_accvgpr_write_b32 a30, s12
+; GREEDY942-NEXT: v_accvgpr_write_b32 a29, s11
+; GREEDY942-NEXT: v_accvgpr_write_b32 a28, s10
+; GREEDY942-NEXT: v_accvgpr_write_b32 a27, s9
+; GREEDY942-NEXT: v_accvgpr_write_b32 a26, s8
+; GREEDY942-NEXT: v_accvgpr_write_b32 a25, s7
+; GREEDY942-NEXT: v_accvgpr_write_b32 a24, s6
+; GREEDY942-NEXT: v_accvgpr_write_b32 a23, s5
+; GREEDY942-NEXT: v_accvgpr_write_b32 a22, s4
+; GREEDY942-NEXT: v_accvgpr_write_b32 a21, s3
+; GREEDY942-NEXT: v_accvgpr_write_b32 a20, s2
+; GREEDY942-NEXT: v_accvgpr_write_b32 a19, s1
+; GREEDY942-NEXT: v_accvgpr_write_b32 a18, s0
; GREEDY942-NEXT: s_nop 1
-; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v0, v1, a[0:15]
-; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[16:31], v0, v1, a[0:15]
-; GREEDY942-NEXT: s_nop 9
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a2, a16
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a3, a17
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a4, a18
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a5, a19
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a6, a20
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a7, a21
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a8, a22
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a9, a23
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a10, a24
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a11, a25
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a12, a26
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a13, a27
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a14, a28
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a15, a29
+; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[18:33], v0, v1, a[18:33]
+; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[2:17], v0, v1, a[18:33]
+; GREEDY942-NEXT: s_nop 8
+; GREEDY942-NEXT: v_accvgpr_mov_b32 a0, a18
+; GREEDY942-NEXT: v_accvgpr_mov_b32 a1, a19
; GREEDY942-NEXT: s_nop 1
; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v0, v1, a[0:15]
-; GREEDY942-NEXT: v_mov_b32_e32 v0, 0
-; GREEDY942-NEXT: s_nop 8
-; GREEDY942-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
-; GREEDY942-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
-; GREEDY942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
-; GREEDY942-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GREEDY942-NEXT: s_nop 9
+; GREEDY942-NEXT: global_store_dwordx4 v2, a[12:15], s[16:17] offset:48
+; GREEDY942-NEXT: global_store_dwordx4 v2, a[8:11], s[16:17] offset:32
+; GREEDY942-NEXT: global_store_dwordx4 v2, a[4:7], s[16:17] offset:16
+; GREEDY942-NEXT: global_store_dwordx4 v2, a[0:3], s[16:17]
; GREEDY942-NEXT: s_endpgm
;
; GREEDY90A-GISEL-LABEL: test_mfma_f32_16x16x1f32:
@@ -847,8 +839,9 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
; FAST90A-LABEL: test_mfma_f32_16x16x1f32:
; FAST90A: ; %bb.0: ; %bb
; FAST90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; FAST90A-NEXT: v_mov_b32_e32 v0, 1.0
-; FAST90A-NEXT: v_mov_b32_e32 v1, 2.0
+; FAST90A-NEXT: v_mov_b32_e32 v1, 1.0
+; FAST90A-NEXT: v_mov_b32_e32 v2, 2.0
+; FAST90A-NEXT: v_mov_b32_e32 v0, 0
; FAST90A-NEXT: s_waitcnt lgkmcnt(0)
; FAST90A-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x0
; FAST90A-NEXT: s_waitcnt lgkmcnt(0)
@@ -869,8 +862,8 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
; FAST90A-NEXT: v_accvgpr_write_b32 a14, s18
; FAST90A-NEXT: v_accvgpr_write_b32 a15, s19
; FAST90A-NEXT: s_nop 1
-; FAST90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
-; FAST90A-NEXT: v_mfma_f32_16x16x1f32 a[16:31], v0, v1, a[0:15]
+; FAST90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v1, v2, a[0:15]
+; FAST90A-NEXT: v_mfma_f32_16x16x1f32 a[16:31], v1, v2, a[0:15]
; FAST90A-NEXT: s_nop 10
; FAST90A-NEXT: v_accvgpr_mov_b32 a2, a16
; FAST90A-NEXT: v_accvgpr_mov_b32 a3, a17
@@ -887,9 +880,8 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
; FAST90A-NEXT: v_accvgpr_mov_b32 a14, a28
; FAST90A-NEXT: v_accvgpr_mov_b32 a15, a29
; FAST90A-NEXT: s_nop 1
-; FAST90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
-; FAST90A-NEXT: v_mov_b32_e32 v0, 0
-; FAST90A-NEXT: s_nop 9
+; FAST90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v1, v2, a[0:15]
+; FAST90A-NEXT: s_nop 10
; FAST90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
; FAST90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
; FAST90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
diff --git a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll
index c77042d0c96c3..cf244f0b1f884 100644
--- a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll
+++ b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll
@@ -6,10 +6,10 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) {
; GFX942-LABEL: matmul_kernel:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX942-NEXT: v_accvgpr_write_b32 a0, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a2, 0
; GFX942-NEXT: s_mov_b32 s2, 0
; GFX942-NEXT: v_accvgpr_write_b32 a1, 0
-; GFX942-NEXT: s_mov_b32 s6, 0
+; GFX942-NEXT: s_mov_b32 s3, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_cmp_lg_u32 s0, 0
; GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0
@@ -18,33 +18,34 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) {
; GFX942-NEXT: s_branch .LBB0_2
; GFX942-NEXT: .LBB0_1: ; %bb2
; GFX942-NEXT: ; in Loop: Header=BB0_2 Depth=1
+; GFX942-NEXT: s_or_b32 s4, s3, 1
+; GFX942-NEXT: s_ashr_i32 s5, s3, 31
; GFX942-NEXT: s_mov_b32 s3, s2
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-NEXT: v_accvgpr_mov_b32 a0, a2
; GFX942-NEXT: v_accvgpr_mov_b32 a2, a1
; GFX942-NEXT: v_accvgpr_mov_b32 a3, a1
-; GFX942-NEXT: s_or_b32 s4, s6, 1
-; GFX942-NEXT: s_ashr_i32 s3, s6, 31
-; GFX942-NEXT: v_mfma_f32_16x16x16_f16 a[2:5], v[0:1], v[0:1], a[0:3]
-; GFX942-NEXT: s_and_b32 s6, s3, s4
-; GFX942-NEXT: s_nop 5
-; GFX942-NEXT: v_accvgpr_mov_b32 a0, a2
+; GFX942-NEXT: s_and_b32 s3, s5, s4
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mfma_f32_16x16x16_f16 a[2:5], v[2:3], v[2:3], a[0:3]
; GFX942-NEXT: s_cbranch_execz .LBB0_4
; GFX942-NEXT: .LBB0_2: ; %bb
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_and_b64 vcc, exec, s[0:1]
; GFX942-NEXT: s_cbranch_vccz .LBB0_1
; GFX942-NEXT: ; %bb.3:
-; GFX942-NEXT: ; implicit-def: $sgpr6
+; GFX942-NEXT: ; implicit-def: $sgpr3
+; GFX942-NEXT: ; implicit-def: $agpr2
; GFX942-NEXT: .LBB0_4: ; %common.ret
; GFX942-NEXT: s_endpgm
;
; GFX908-LABEL: matmul_kernel:
; GFX908: ; %bb.0: ; %entry
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX908-NEXT: v_accvgpr_write_b32 a0, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a2, 0
; GFX908-NEXT: v_accvgpr_write_b32 a1, 0
; GFX908-NEXT: s_mov_b32 s2, 0
-; GFX908-NEXT: s_mov_b32 s6, 0
+; GFX908-NEXT: s_mov_b32 s3, 0
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: s_cmp_lg_u32 s0, 0
; GFX908-NEXT: s_cselect_b64 s[0:1], -1, 0
@@ -53,28 +54,28 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) {
; GFX908-NEXT: s_branch .LBB0_2
; GFX908-NEXT: .LBB0_1: ; %bb2
; GFX908-NEXT: ; in Loop: Header=BB0_2 Depth=1
+; GFX908-NEXT: s_or_b32 s4, s3, 1
+; GFX908-NEXT: s_ashr_i32 s5, s3, 31
; GFX908-NEXT: s_mov_b32 s3, s2
-; GFX908-NEXT: v_mov_b32_e32 v0, s2
-; GFX908-NEXT: v_mov_b32_e32 v1, s3
+; GFX908-NEXT: v_mov_b32_e32 v1, s2
+; GFX908-NEXT: s_nop 2
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a2
+; GFX908-NEXT: v_mov_b32_e32 v2, s3
; GFX908-NEXT: v_accvgpr_read_b32 v4, a1
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a1
-; GFX908-NEXT: s_or_b32 s4, s6, 1
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a1
+; GFX908-NEXT: v_accvgpr_write_b32 a0, v0
; GFX908-NEXT: v_accvgpr_write_b32 a2, v4
-; GFX908-NEXT: v_accvgpr_write_b32 a3, v2
-; GFX908-NEXT: s_ashr_i32 s3, s6, 31
-; GFX908-NEXT: v_mfma_f32_16x16x16f16 a[2:5], v[0:1], v[0:1], a[0:3]
-; GFX908-NEXT: s_and_b32 s6, s3, s4
-; GFX908-NEXT: s_nop 8
-; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v3
+; GFX908-NEXT: s_and_b32 s3, s5, s4
+; GFX908-NEXT: v_mfma_f32_16x16x16f16 a[2:5], v[1:2], v[1:2], a[0:3]
; GFX908-NEXT: s_cbranch_execz .LBB0_4
; GFX908-NEXT: .LBB0_2: ; %bb
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_and_b64 vcc, exec, s[0:1]
; GFX908-NEXT: s_cbranch_vccz .LBB0_1
; GFX908-NEXT: ; %bb.3:
-; GFX908-NEXT: ; implicit-def: $sgpr6
+; GFX908-NEXT: ; implicit-def: $sgpr3
+; GFX908-NEXT: ; implicit-def: $agpr2
; GFX908-NEXT: .LBB0_4: ; %common.ret
; GFX908-NEXT: s_endpgm
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.mir b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.mir
index 29f44282f06fc..01506d0af1913 100644
--- a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.mir
+++ b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.mir
@@ -83,12 +83,13 @@ body: |
; COALESCE-NEXT: undef [[AV_MOV_:%[0-9]+]].sub0:areg_128_align2 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
; COALESCE-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_CSELECT_B64_]], implicit $exec
; COALESCE-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 1, [[V_CNDMASK_B32_e64_]], implicit $exec
- ; COALESCE-NEXT: [[AV_MOV_:%[0-9]+]].sub1:areg_128_align2 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
+ ; COALESCE-NEXT: undef [[AV_MOV_1:%[0-9]+]].sub1:areg_128_align2 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
; COALESCE-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0
; COALESCE-NEXT: {{ $}}
; COALESCE-NEXT: bb.1:
; COALESCE-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
; COALESCE-NEXT: {{ $}}
+ ; COALESCE-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[AV_MOV_]].sub0
; COALESCE-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 -1
; COALESCE-NEXT: $vcc = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_]], implicit-def dead $scc
; COALESCE-NEXT: S_CBRANCH_VCCNZ %bb.3, implicit killed $vcc
@@ -101,12 +102,12 @@ body: |
; COALESCE-NEXT: [[S_ASHR_I32_:%[0-9]+]]:sreg_32 = S_ASHR_I32 [[S_MOV_B32_1]], 31, implicit-def dead $scc
; COALESCE-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_ASHR_I32_]], [[S_OR_B32_]], implicit-def dead $scc
; COALESCE-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_64 = COPY [[S_MOV_B32_]].sub0
- ; COALESCE-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY [[S_MOV_B32_]]
- ; COALESCE-NEXT: [[AV_MOV_:%[0-9]+]].sub2:areg_128_align2 = COPY [[AV_MOV_]].sub1
- ; COALESCE-NEXT: [[AV_MOV_:%[0-9]+]].sub3:areg_128_align2 = COPY [[AV_MOV_]].sub1
- ; COALESCE-NEXT: [[V_MFMA_F32_16X16X16F16_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[AV_MOV_]], 0, 0, 0, implicit $mode, implicit $exec
+ ; COALESCE-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B32_]]
+ ; COALESCE-NEXT: [[AV_MOV_1:%[0-9]+]].sub0:areg_128_align2 = COPY [[COPY1]]
+ ; COALESCE-NEXT: [[AV_MOV_1:%[0-9]+]].sub2:areg_128_align2 = COPY [[AV_MOV_1]].sub1
+ ; COALESCE-NEXT: [[AV_MOV_1:%[0-9]+]].sub3:areg_128_align2 = COPY [[AV_MOV_1]].sub1
+ ; COALESCE-NEXT: [[AV_MOV_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY2]], [[COPY2]], [[AV_MOV_1]], 0, 0, 0, implicit $mode, implicit $exec
; COALESCE-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 0
- ; COALESCE-NEXT: [[AV_MOV_:%[0-9]+]].sub0:areg_128_align2 = COPY [[V_MFMA_F32_16X16X16F16_e64_]].sub0
; COALESCE-NEXT: {{ $}}
; COALESCE-NEXT: bb.3:
; COALESCE-NEXT: successors: %bb.4(0x40000000), %bb.1(0x40000000)
@@ -136,12 +137,13 @@ body: |
; GFX908-COALESCE-NEXT: undef [[AV_MOV_:%[0-9]+]].sub0:areg_128_align2 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
; GFX908-COALESCE-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_CSELECT_B64_]], implicit $exec
; GFX908-COALESCE-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 1, [[V_CNDMASK_B32_e64_]], implicit $exec
- ; GFX908-COALESCE-NEXT: [[AV_MOV_:%[0-9]+]].sub1:areg_128_align2 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
+ ; GFX908-COALESCE-NEXT: undef [[AV_MOV_1:%[0-9]+]].sub1:areg_128_align2 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
; GFX908-COALESCE-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0
; GFX908-COALESCE-NEXT: {{ $}}
; GFX908-COALESCE-NEXT: bb.1:
; GFX908-COALESCE-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
; GFX908-COALESCE-NEXT: {{ $}}
+ ; GFX908-COALESCE-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[AV_MOV_]].sub0
; GFX908-COALESCE-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 -1
; GFX908-COALESCE-NEXT: $vcc = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_]], implicit-def dead $scc
; GFX908-COALESCE-NEXT: S_CBRANCH_VCCNZ %bb.3, implicit killed $vcc
@@ -154,12 +156,12 @@ body: |
; GFX908-COALESCE-NEXT: [[S_ASHR_I32_:%[0-9]+]]:sreg_32 = S_ASHR_I32 [[S_MOV_B32_1]], 31, implicit-def dead $scc
; GFX908-COALESCE-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_ASHR_I32_]], [[S_OR_B32_]], implicit-def dead $scc
; GFX908-COALESCE-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_64 = COPY [[S_MOV_B32_]].sub0
- ; GFX908-COALESCE-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY [[S_MOV_B32_]]
- ; GFX908-COALESCE-NEXT: [[AV_MOV_:%[0-9]+]].sub2:areg_128_align2 = COPY [[AV_MOV_]].sub1
- ; GFX908-COALESCE-NEXT: [[AV_MOV_:%[0-9]+]].sub3:areg_128_align2 = COPY [[AV_MOV_]].sub1
- ; GFX908-COALESCE-NEXT: [[V_MFMA_F32_16X16X16F16_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[AV_MOV_]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX908-COALESCE-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B32_]]
+ ; GFX908-COALESCE-NEXT: [[AV_MOV_1:%[0-9]+]].sub0:areg_128_align2 = COPY [[COPY1]]
+ ; GFX908-COALESCE-NEXT: [[AV_MOV_1:%[0-9]+]].sub2:areg_128_align2 = COPY [[AV_MOV_1]].sub1
+ ; GFX908-COALESCE-NEXT: [[AV_MOV_1:%[0-9]+]].sub3:areg_128_align2 = COPY [[AV_MOV_1]].sub1
+ ; GFX908-COALESCE-NEXT: [[AV_MOV_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY2]], [[COPY2]], [[AV_MOV_1]], 0, 0, 0, implicit $mode, implicit $exec
; GFX908-COALESCE-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 0
- ; GFX908-COALESCE-NEXT: [[AV_MOV_:%[0-9]+]].sub0:areg_128_align2 = COPY [[V_MFMA_F32_16X16X16F16_e64_]].sub0
; GFX908-COALESCE-NEXT: {{ $}}
; GFX908-COALESCE-NEXT: bb.3:
; GFX908-COALESCE-NEXT: successors: %bb.4(0x40000000), %bb.1(0x40000000)
diff --git a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-read.mir b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-read.mir
index 17458fa8b08a7..a9207de317ea1 100644
--- a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-read.mir
+++ b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-read.mir
@@ -74,7 +74,7 @@ body: |
; COALESCE-NEXT: successors: %bb.3(0x80000000)
; COALESCE-NEXT: {{ $}}
; COALESCE-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]].sub0
- ; COALESCE-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY [[S_MOV_B32_]].sub0_sub1
+ ; COALESCE-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B32_]].sub0_sub1
; COALESCE-NEXT: [[V_MFMA_F32_16X16X16F16_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], 0, 0, 0, 0, implicit $mode, implicit $exec
; COALESCE-NEXT: [[V_MFMA_F32_16X16X16F16_e64_1:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_MFMA_F32_16X16X16F16_e64_]], 0, 0, 0, implicit $mode, implicit $exec
; COALESCE-NEXT: [[V_MFMA_F32_16X16X16F16_e64_2:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_MFMA_F32_16X16X16F16_e64_1]], 0, 0, 0, implicit $mode, implicit $exec
@@ -116,7 +116,7 @@ body: |
; GFX908-COALESCE-NEXT: [[AV_MOV_1:%[0-9]+]].sub2:areg_128_align2 = COPY [[AV_MOV_1]].sub0
; GFX908-COALESCE-NEXT: [[AV_MOV_1:%[0-9]+]].sub3:areg_128_align2 = COPY [[AV_MOV_1]].sub0
; GFX908-COALESCE-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]].sub0
- ; GFX908-COALESCE-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY [[S_MOV_B32_]].sub0_sub1
+ ; GFX908-COALESCE-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B32_]].sub0_sub1
; GFX908-COALESCE-NEXT: [[V_MFMA_F32_16X16X16F16_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[AV_MOV_1]], 0, 0, 0, implicit $mode, implicit $exec
; GFX908-COALESCE-NEXT: [[V_MFMA_F32_16X16X16F16_e64_1:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_MFMA_F32_16X16X16F16_e64_]], 0, 0, 0, implicit $mode, implicit $exec
; GFX908-COALESCE-NEXT: [[V_MFMA_F32_16X16X16F16_e64_2:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_MFMA_F32_16X16X16F16_e64_1]], 0, 0, 0, implicit $mode, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll
index 110604a7cd88e..f4a9e7e8f2759 100644
--- a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll
@@ -521,8 +521,8 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
; GFX908-NEXT: v_readlane_b32 s16, v39, 22
; GFX908-NEXT: s_mov_b32 s12, s24
; GFX908-NEXT: s_mov_b32 s13, s23
-; GFX908-NEXT: v_mov_b32_e32 v31, v32
; GFX908-NEXT: s_mov_b32 s14, s22
+; GFX908-NEXT: v_mov_b32_e32 v31, v32
; GFX908-NEXT: s_mov_b32 s15, s21
; GFX908-NEXT: s_mov_b64 s[10:11], s[26:27]
; GFX908-NEXT: v_readlane_b32 s17, v39, 23
diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-phi.ll b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-phi.ll
index 4e6b9166b3ed0..fc154604b8700 100644
--- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-phi.ll
+++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-phi.ll
@@ -27,6 +27,7 @@ define amdgpu_kernel void @test_rewrite_mfma_copy_to_agpr_phi(ptr addrspace(1) %
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v33, v34, a[0:31]
+; CHECK-NEXT: ; kill: def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 killed $exec
; CHECK-NEXT: s_cbranch_execz .LBB0_3
; CHECK-NEXT: s_branch .LBB0_4
; CHECK-NEXT: .LBB0_2:
@@ -46,6 +47,7 @@ define amdgpu_kernel void @test_rewrite_mfma_copy_to_agpr_phi(ptr addrspace(1) %
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v32, v33, a[0:31]
+; CHECK-NEXT: ; kill: def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 killed $exec
; CHECK-NEXT: .LBB0_4: ; %endif
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use a[0:31]
diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
index ecada6b300aa1..b9e9893ede4e2 100644
--- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
@@ -369,7 +369,7 @@ define amdgpu_kernel void @illegal_mfma_after_rewrite() #1 {
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_mov_b32 s0, 0
; CHECK-NEXT: s_mov_b32 s1, s0
-; CHECK-NEXT: v_mov_b64_e32 v[28:29], s[0:1]
+; CHECK-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def s[0:3]
; CHECK-NEXT: ;;#ASMEND
@@ -378,66 +378,73 @@ define amdgpu_kernel void @illegal_mfma_after_rewrite() #1 {
; CHECK-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; CHECK-NEXT: s_mov_b32 s0, 0x3c003c00
; CHECK-NEXT: s_mov_b32 s1, s0
-; CHECK-NEXT: v_mov_b64_e32 v[30:31], s[0:1]
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[8:9], v[8:9], v[4:7]
+; CHECK-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
; CHECK-NEXT: s_mov_b32 s0, 0x7e007e00
; CHECK-NEXT: s_mov_b32 s1, s0
-; CHECK-NEXT: v_accvgpr_write_b32 a0, s0
-; CHECK-NEXT: v_accvgpr_write_b32 a1, s1
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[28:29], v[28:29], v[4:7]
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[28:29], v[30:31], v[4:7]
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[12:15], v[28:29], a[0:1], v[4:7]
-; CHECK-NEXT: s_nop 2
+; CHECK-NEXT: v_mov_b64_e32 v[10:11], s[0:1]
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[14:17], v[8:9], v[12:13], v[4:7]
+; CHECK-NEXT: s_nop 1
+; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[18:21], v[8:9], v[10:11], v[4:7]
+; CHECK-NEXT: v_accvgpr_write_b32 a1, v1
+; CHECK-NEXT: v_accvgpr_write_b32 a2, v2
+; CHECK-NEXT: v_accvgpr_write_b32 a3, v3
; CHECK-NEXT: v_mov_b32_e32 v4, 0x7fc00000
; CHECK-NEXT: v_mov_b32_e32 v5, v4
; CHECK-NEXT: v_mov_b32_e32 v6, v4
; CHECK-NEXT: v_mov_b32_e32 v7, v4
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[28:29], v[28:29], v[8:11]
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[14:17], v[8:9], v[8:9], v[14:17]
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[16:19], v[28:29], v[28:29], v[4:7]
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[22:25], v[8:9], v[8:9], v[4:7]
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def v[4:7]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[16:19], v[28:29], v[28:29], v[16:19]
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[24:27], v[28:29], v[30:31], v[4:7]
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[8:9], v[12:13], v[4:7]
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[26:29], v[8:9], v[8:9], v[4:7]
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[8:9], v[8:9], v[0:3]
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[22:25], v[8:9], v[8:9], v[22:25]
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[4:7], v[8:9], v[8:9], v[26:29]
; CHECK-NEXT: s_nop 5
-; CHECK-NEXT: v_cvt_f16_f32_e32 v17, v8
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[28:29], v[28:29], v[12:15]
-; CHECK-NEXT: s_nop 2
-; CHECK-NEXT: v_mov_b64_e32 v[12:13], 0
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[28:29], v[28:29], v[0:3]
-; CHECK-NEXT: global_store_short v[12:13], v17, off
+; CHECK-NEXT: v_cvt_f16_f32_e32 v23, v14
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[14:17], v[8:9], v[8:9], v[18:21]
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[12:13], v[8:9], v[0:3]
+; CHECK-NEXT: s_nop 1
+; CHECK-NEXT: v_accvgpr_read_b32 v19, a3
+; CHECK-NEXT: v_accvgpr_read_b32 v18, a2
+; CHECK-NEXT: v_mov_b64_e32 v[20:21], 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: v_accvgpr_read_b32 v17, a1
+; CHECK-NEXT: v_accvgpr_read_b32 v16, a0
+; CHECK-NEXT: v_cvt_f16_f32_e32 v15, v22
+; CHECK-NEXT: v_cvt_f16_f32_e32 v14, v14
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[16:19], v[8:9], v[8:9], v[16:19]
+; CHECK-NEXT: v_cvt_f16_f32_e32 v12, v0
+; CHECK-NEXT: global_store_short v[20:21], v23, off
; CHECK-NEXT: buffer_wbl2 sc0 sc1
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_inv sc0 sc1
-; CHECK-NEXT: v_cvt_f16_f32_e32 v9, v16
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[20:23], v[28:29], v[28:29], v[4:7]
-; CHECK-NEXT: global_store_short v[12:13], v9, off
-; CHECK-NEXT: v_cvt_f16_f32_e32 v1, v8
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[28:29], v[28:29], v[24:27]
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[10:11], v[8:9], v[4:7]
+; CHECK-NEXT: global_store_short v[20:21], v15, off
; CHECK-NEXT: buffer_wbl2 sc0 sc1
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_inv sc0 sc1
-; CHECK-NEXT: v_cvt_f16_f32_e32 v14, v0
-; CHECK-NEXT: global_store_short v[12:13], v1, off
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[4:7], v[28:29], v[28:29], v[20:23]
+; CHECK-NEXT: global_store_short v[20:21], v14, off
+; CHECK-NEXT: v_cvt_f16_f32_e32 v14, v16
; CHECK-NEXT: buffer_wbl2 sc0 sc1
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_inv sc0 sc1
-; CHECK-NEXT: global_store_short v[12:13], v14, off
+; CHECK-NEXT: global_store_short v[20:21], v14, off
+; CHECK-NEXT: v_cvt_f16_f32_e32 v0, v0
; CHECK-NEXT: buffer_wbl2 sc0 sc1
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_inv sc0 sc1
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[30:31], v[28:29], v[8:11]
-; CHECK-NEXT: s_nop 6
-; CHECK-NEXT: v_cvt_f16_f32_e32 v8, v0
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], a[0:1], v[28:29], v[4:7]
-; CHECK-NEXT: global_store_short v[12:13], v8, off
+; CHECK-NEXT: global_store_short v[20:21], v12, off
; CHECK-NEXT: buffer_wbl2 sc0 sc1
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_inv sc0 sc1
-; CHECK-NEXT: s_nop 2
-; CHECK-NEXT: v_cvt_f16_f32_e32 v0, v0
-; CHECK-NEXT: global_store_short v[12:13], v0, off
+; CHECK-NEXT: global_store_short v[20:21], v0, off
; CHECK-NEXT: s_endpgm
entry:
%k0 = call <4 x float> asm sideeffect "; def $0", "=s"()
@@ -812,32 +819,32 @@ define amdgpu_kernel void @test_rewrite_mfma_direct_copy_from_agpr_class_subreg_
; CHECK-NEXT: ; def a[0:31]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: v_mov_b32_e32 v18, 4.0
-; CHECK-NEXT: v_accvgpr_mov_b32 a0, a1
-; CHECK-NEXT: v_accvgpr_mov_b32 a1, a2
-; CHECK-NEXT: v_accvgpr_mov_b32 a2, a3
-; CHECK-NEXT: v_accvgpr_mov_b32 a3, a4
-; CHECK-NEXT: v_accvgpr_mov_b32 a4, a5
-; CHECK-NEXT: v_accvgpr_mov_b32 a5, a6
-; CHECK-NEXT: v_accvgpr_mov_b32 a6, a7
-; CHECK-NEXT: v_accvgpr_mov_b32 a7, a8
-; CHECK-NEXT: v_accvgpr_mov_b32 a8, a9
-; CHECK-NEXT: v_accvgpr_mov_b32 a9, a10
-; CHECK-NEXT: v_accvgpr_mov_b32 a10, a11
-; CHECK-NEXT: v_accvgpr_mov_b32 a11, a12
-; CHECK-NEXT: v_accvgpr_mov_b32 a12, a13
-; CHECK-NEXT: v_accvgpr_mov_b32 a13, a14
-; CHECK-NEXT: v_accvgpr_mov_b32 a14, a15
-; CHECK-NEXT: v_accvgpr_mov_b32 a15, a16
+; CHECK-NEXT: v_accvgpr_mov_b32 a17, a16
+; CHECK-NEXT: v_accvgpr_mov_b32 a16, a15
+; CHECK-NEXT: v_accvgpr_mov_b32 a15, a14
+; CHECK-NEXT: v_accvgpr_mov_b32 a14, a13
+; CHECK-NEXT: v_accvgpr_mov_b32 a13, a12
+; CHECK-NEXT: v_accvgpr_mov_b32 a12, a11
+; CHECK-NEXT: v_accvgpr_mov_b32 a11, a10
+; CHECK-NEXT: v_accvgpr_mov_b32 a10, a9
+; CHECK-NEXT: v_accvgpr_mov_b32 a9, a8
+; CHECK-NEXT: v_accvgpr_mov_b32 a8, a7
+; CHECK-NEXT: v_accvgpr_mov_b32 a7, a6
+; CHECK-NEXT: v_accvgpr_mov_b32 a6, a5
+; CHECK-NEXT: v_accvgpr_mov_b32 a5, a4
+; CHECK-NEXT: v_accvgpr_mov_b32 a4, a3
+; CHECK-NEXT: v_accvgpr_mov_b32 a3, a2
+; CHECK-NEXT: v_accvgpr_mov_b32 a2, a1
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v1, v18, a[0:15]
+; CHECK-NEXT: v_mfma_f32_16x16x1_4b_f32 a[2:17], v1, v18, a[2:17]
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 6, v0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_nop 7
-; CHECK-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; CHECK-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; CHECK-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; CHECK-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; CHECK-NEXT: global_store_dwordx4 v0, a[14:17], s[0:1] offset:48
+; CHECK-NEXT: global_store_dwordx4 v0, a[10:13], s[0:1] offset:32
+; CHECK-NEXT: global_store_dwordx4 v0, a[6:9], s[0:1] offset:16
+; CHECK-NEXT: global_store_dwordx4 v0, a[2:5], s[0:1]
; CHECK-NEXT: s_endpgm
%def = call <32 x float> asm sideeffect "; def $0", "=a"()
%src2 = shufflevector <32 x float> %def, <32 x float> poison, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector-physreg-copy.ll b/llvm/test/CodeGen/AMDGPU/shufflevector-physreg-copy.ll
index 3ee558d6f8a9e..4d864ad15b411 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector-physreg-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector-physreg-copy.ll
@@ -576,9 +576,9 @@ define void @shufflevector_v2i32_10_physreg_even_agpr_pair_copy(ptr addrspace(1)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a4, a5
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v3, a4
-; GFX90A-NEXT: v_accvgpr_read_b32 v2, a5
-; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[16:17]
+; GFX90A-NEXT: v_accvgpr_mov_b32 a1, a4
+; GFX90A-NEXT: v_accvgpr_mov_b32 a0, a5
+; GFX90A-NEXT: global_store_dwordx2 v0, a[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -590,9 +590,9 @@ define void @shufflevector_v2i32_10_physreg_even_agpr_pair_copy(ptr addrspace(1)
; GFX940-NEXT: ; def a4, a5
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_accvgpr_read_b32 v3, a4
-; GFX940-NEXT: v_accvgpr_read_b32 v2, a5
-; GFX940-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1]
+; GFX940-NEXT: v_accvgpr_mov_b32 a1, a4
+; GFX940-NEXT: v_accvgpr_mov_b32 a0, a5
+; GFX940-NEXT: global_store_dwordx2 v0, a[0:1], s[0:1]
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%asm = call { i32, i32 } asm "; def $0, $1", "={a4},={a5}"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v3f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v3f32.ll
index 50cdf11eea2f7..34043cd067b25 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v3f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v3f32.ll
@@ -413,27 +413,25 @@ define void @v_shuffle_v2f32_v3f32__5_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v2f32_v3f32__5_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: global_store_dwordx2 v3, v[4:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2f32_v3f32__5_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: global_store_dwordx2 v3, v[4:5], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -674,27 +672,25 @@ define void @v_shuffle_v2f32_v3f32__2_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v2f32_v3f32__2_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: global_store_dwordx2 v3, v[4:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2f32_v3f32__2_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: global_store_dwordx2 v3, v[4:5], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v3i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v3i32.ll
index a6a84c780cb32..f65340470feb1 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v3i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v3i32.ll
@@ -413,27 +413,25 @@ define void @v_shuffle_v2i32_v3i32__5_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v2i32_v3i32__5_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: global_store_dwordx2 v3, v[4:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2i32_v3i32__5_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: global_store_dwordx2 v3, v[4:5], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -674,27 +672,25 @@ define void @v_shuffle_v2i32_v3i32__2_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v2i32_v3i32__2_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: global_store_dwordx2 v3, v[4:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2i32_v3i32__2_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: global_store_dwordx2 v3, v[4:5], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v2i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v2i64.ll
index 0b20caea9cd95..51dc9a51ec9d0 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v2i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v2i64.ll
@@ -291,31 +291,27 @@ define void @v_shuffle_v2i64_v2i64__3_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v2i64_v2i64__3_2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: v_mov_b32_e32 v5, v1
+; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2i64_v2i64__3_2:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: v_mov_b32_e32 v5, v1
+; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i64> asm "; def $0", "=v"()
@@ -478,31 +474,27 @@ define void @v_shuffle_v2i64_v2i64__1_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v2i64_v2i64__1_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: v_mov_b32_e32 v5, v1
+; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2i64_v2i64__1_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: v_mov_b32_e32 v5, v1
+; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i64> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v2p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v2p0.ll
index 2ecbf9622a259..7f8f2dbbb09a1 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v2p0.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v2p0.ll
@@ -291,31 +291,27 @@ define void @v_shuffle_v2p0_v2p0__3_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v2p0_v2p0__3_2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: v_mov_b32_e32 v5, v1
+; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2p0_v2p0__3_2:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: v_mov_b32_e32 v5, v1
+; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr> asm "; def $0", "=v"()
@@ -478,31 +474,27 @@ define void @v_shuffle_v2p0_v2p0__1_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v2p0_v2p0__1_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: v_mov_b32_e32 v5, v1
+; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2p0_v2p0__1_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: v_mov_b32_e32 v5, v1
+; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v3p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v3p3.ll
index bacec04ab7600..13e3d94c35446 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v3p3.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v3p3.ll
@@ -413,27 +413,25 @@ define void @v_shuffle_v2p3_v3p3__5_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v2p3_v3p3__5_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: global_store_dwordx2 v3, v[4:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2p3_v3p3__5_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: global_store_dwordx2 v3, v[4:5], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -674,27 +672,25 @@ define void @v_shuffle_v2p3_v3p3__2_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v2p3_v3p3__2_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: global_store_dwordx2 v3, v[4:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2p3_v3p3__2_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: global_store_dwordx2 v3, v[4:5], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v2f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v2f32.ll
index fb71492fb867d..430f64164d24f 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v2f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v2f32.ll
@@ -170,15 +170,15 @@ define void @v_shuffle_v3f32_v2f32__3_0_u(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v1
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -186,15 +186,15 @@ define void @v_shuffle_v3f32_v2f32__3_0_u(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:3]
+; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:1]
+; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-NEXT: global_store_dwordx3 v4, v[2:4], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v1
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=v"()
@@ -273,27 +273,27 @@ define void @v_shuffle_v3f32_v2f32__3_2_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_2_u:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v1
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v2f32__3_2_u:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:1]
+; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v1
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=v"()
@@ -469,29 +469,27 @@ define void @v_shuffle_v3f32_v2f32__3_3_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_3_2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v1
-; GFX90A-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v2f32__3_3_2:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:1]
+; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v1
-; GFX942-NEXT: v_mov_b32_e32 v5, v1
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v1, v3
+; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=v"()
@@ -563,27 +561,26 @@ define void @v_shuffle_v3f32_v2f32__u_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v2f32__u_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: global_store_dwordx3 v2, v[2:4], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v0
+; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v2f32__u_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: global_store_dwordx3 v2, v[2:4], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v1, v0
+; GFX942-NEXT: v_mov_b32_e32 v2, v0
+; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=v"()
@@ -654,29 +651,27 @@ define void @v_shuffle_v3f32_v2f32__1_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v2f32__1_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v1
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v2f32__1_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:1]
+; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v1
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=v"()
@@ -701,27 +696,26 @@ define void @v_shuffle_v3f32_v2f32__2_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v2f32__2_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: global_store_dwordx3 v2, v[2:4], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v0
+; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v2f32__2_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: global_store_dwordx3 v2, v[2:4], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v1, v0
+; GFX942-NEXT: v_mov_b32_e32 v2, v0
+; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=v"()
@@ -750,35 +744,32 @@ define void @v_shuffle_v3f32_v2f32__3_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v1
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v2f32__3_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v1
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=v"()
@@ -808,33 +799,30 @@ define void @v_shuffle_v3f32_v2f32__3_u_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_u_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
+; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v2f32__3_u_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
+; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=v"()
@@ -864,35 +852,32 @@ define void @v_shuffle_v3f32_v2f32__3_1_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_1_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v1
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v1
+; GFX90A-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v2f32__3_1_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v1
+; GFX942-NEXT: v_mov_b32_e32 v1, v3
+; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=v"()
@@ -922,35 +907,33 @@ define void @v_shuffle_v3f32_v2f32__3_2_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_2_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v0, v5
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v2
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
+; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v2f32__3_2_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:1]
+; GFX942-NEXT: ; def v[4:5]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v2
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v5
+; GFX942-NEXT: v_mov_b32_e32 v1, v4
+; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=v"()
@@ -1262,35 +1245,34 @@ define void @v_shuffle_v3f32_v2f32__3_0_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_0_1:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v1
-; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, v1
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v2f32__3_0_1:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:1]
+; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:3]
+; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: v_mov_b32_e32 v8, v1
-; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v1
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=v"()
@@ -1320,35 +1302,34 @@ define void @v_shuffle_v3f32_v2f32__3_2_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_2_1:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v2
-; GFX90A-NEXT: v_mov_b32_e32 v8, v1
-; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v5
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v2f32__3_2_1:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:1]
+; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:3]
+; GFX942-NEXT: ; def v[4:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v2
-; GFX942-NEXT: v_mov_b32_e32 v8, v1
-; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: v_mov_b32_e32 v0, v5
+; GFX942-NEXT: v_mov_b32_e32 v1, v4
+; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=v"()
@@ -1478,29 +1459,27 @@ define void @v_shuffle_v3f32_v2f32__3_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_2_2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v1
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v2f32__3_2_2:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:1]
+; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v1
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=v"()
@@ -1527,27 +1506,25 @@ define void @v_shuffle_v3f32_v2f32__3_u_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_u_2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_mov_b32_e32 v1, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v1
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: global_store_dwordx3 v1, v[0:2], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v2f32__3_u_2:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b32_e32 v1, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:1]
+; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v1
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: global_store_dwordx3 v1, v[0:2], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=v"()
@@ -1576,35 +1553,34 @@ define void @v_shuffle_v3f32_v2f32__3_0_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_0_2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v2
-; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v5
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v2f32__3_0_2:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:1]
+; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:3]
+; GFX942-NEXT: ; def v[4:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: v_mov_b32_e32 v8, v2
-; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: v_mov_b32_e32 v0, v5
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=v"()
@@ -1880,27 +1856,26 @@ define void @v_shuffle_v3f32_v2f32__3_u_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_u_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v1
-; GFX90A-NEXT: v_mov_b32_e32 v6, v1
-; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, v1
+; GFX90A-NEXT: v_mov_b32_e32 v2, v1
+; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v2f32__3_u_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v1
-; GFX942-NEXT: v_mov_b32_e32 v6, v1
-; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v1
+; GFX942-NEXT: v_mov_b32_e32 v2, v1
+; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=v"()
@@ -1930,35 +1905,34 @@ define void @v_shuffle_v3f32_v2f32__3_0_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_0_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v3
-; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v5
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, v5
+; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v2f32__3_0_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:1]
+; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:3]
+; GFX942-NEXT: ; def v[4:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: v_mov_b32_e32 v8, v3
-; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: v_mov_b32_e32 v0, v5
+; GFX942-NEXT: v_mov_b32_e32 v2, v5
+; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=v"()
@@ -2039,29 +2013,28 @@ define void @v_shuffle_v3f32_v2f32__3_2_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_2_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v1
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v1
-; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v2f32__3_2_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:1]
+; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v1
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v1
-; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v3f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v3f32.ll
index 1ab87d6f19ec4..ef670e963bdb6 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v3f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v3f32.ll
@@ -416,27 +416,25 @@ define void @v_shuffle_v3f32_v3f32__5_3_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_3_u:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v3f32__5_3_u:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: global_store_dwordx3 v4, v[2:4], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -961,29 +959,28 @@ define void @v_shuffle_v3f32_v3f32__1_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v3f32__1_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v1
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v1
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v3f32__1_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v1
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, v1
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -1010,29 +1007,27 @@ define void @v_shuffle_v3f32_v3f32__2_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v3f32__2_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v3f32__2_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -1400,14 +1395,13 @@ define void @v_shuffle_v3f32_v3f32__5_3_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v2
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1417,14 +1411,13 @@ define void @v_shuffle_v3f32_v3f32__5_3_0(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v7, v2
-; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -2015,14 +2008,13 @@ define void @v_shuffle_v3f32_v3f32__5_3_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v2
-; GFX90A-NEXT: v_mov_b32_e32 v8, v1
-; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v6, v1
+; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2032,14 +2024,13 @@ define void @v_shuffle_v3f32_v3f32__5_3_1(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, v1
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v7, v2
-; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, v1
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -2868,29 +2859,28 @@ define void @v_shuffle_v3f32_v3f32__4_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v3f32__4_3_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v1
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v1
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v3f32__4_3_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v1
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, v1
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -2918,29 +2908,27 @@ define void @v_shuffle_v3f32_v3f32__5_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_3_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v3f32__5_3_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -3019,14 +3007,13 @@ define void @v_shuffle_v3f32_v3f32__5_0_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v2
-; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -3036,14 +3023,13 @@ define void @v_shuffle_v3f32_v3f32__5_0_3(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v8, v2
-; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -3181,29 +3167,27 @@ define void @v_shuffle_v3f32_v3f32__5_4_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_4_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v3f32__5_4_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v1
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v1
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -3626,14 +3610,13 @@ define void @v_shuffle_v3f32_v3f32__5_0_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v3
-; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -3643,14 +3626,13 @@ define void @v_shuffle_v3f32_v3f32__5_0_4(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v8, v3
-; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -3791,29 +3773,27 @@ define void @v_shuffle_v3f32_v3f32__5_3_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_3_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v1
-; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v1
+; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v3f32__5_3_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v1
-; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, v1
+; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v4f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v4f32.ll
index c5a08f098b4c6..50c69de069986 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v4f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v4f32.ll
@@ -965,29 +965,26 @@ define void @v_shuffle_v3f32_v4f32__7_7_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_7_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v4f32__7_7_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -1014,29 +1011,26 @@ define void @v_shuffle_v3f32_v4f32__7_7_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_7_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v1
-; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v4, v1
+; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v4f32__7_7_5:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_mov_b32_e32 v8, v1
-; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: v_mov_b32_e32 v4, v1
+; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -1247,29 +1241,28 @@ define void @v_shuffle_v3f32_v4f32__1_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v4f32__1_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v1
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v1
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v4f32__1_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v1
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, v1
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -1296,29 +1289,26 @@ define void @v_shuffle_v3f32_v4f32__2_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v4f32__2_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v4f32__2_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -1345,29 +1335,28 @@ define void @v_shuffle_v3f32_v4f32__3_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v4f32__3_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v4f32__3_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -1849,14 +1838,14 @@ define void @v_shuffle_v3f32_v4f32__7_4_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, v5
-; GFX90A-NEXT: v_mov_b32_e32 v9, v2
-; GFX90A-NEXT: v_mov_b32_e32 v10, v0
-; GFX90A-NEXT: global_store_dwordx3 v6, v[8:10], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v5
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1866,14 +1855,14 @@ define void @v_shuffle_v3f32_v4f32__7_4_0(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v10, v0
-; GFX942-NEXT: v_mov_b32_e32 v8, v5
-; GFX942-NEXT: v_mov_b32_e32 v9, v2
-; GFX942-NEXT: global_store_dwordx3 v6, v[8:10], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, v5
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -2678,14 +2667,14 @@ define void @v_shuffle_v3f32_v4f32__7_4_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, v5
-; GFX90A-NEXT: v_mov_b32_e32 v9, v2
-; GFX90A-NEXT: v_mov_b32_e32 v10, v1
-; GFX90A-NEXT: global_store_dwordx3 v6, v[8:10], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v5
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v6, v1
+; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2695,14 +2684,14 @@ define void @v_shuffle_v3f32_v4f32__7_4_1(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v10, v1
-; GFX942-NEXT: v_mov_b32_e32 v8, v5
-; GFX942-NEXT: v_mov_b32_e32 v9, v2
-; GFX942-NEXT: global_store_dwordx3 v6, v[8:10], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, v1
+; GFX942-NEXT: v_mov_b32_e32 v4, v5
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -4675,29 +4664,28 @@ define void @v_shuffle_v3f32_v4f32__5_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v4f32__5_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v1
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v1
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v4f32__5_4_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v1
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, v1
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -4725,29 +4713,26 @@ define void @v_shuffle_v3f32_v4f32__6_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v4f32__6_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v4f32__6_4_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -4775,29 +4760,28 @@ define void @v_shuffle_v3f32_v4f32__7_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v4f32__7_4_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -4824,27 +4808,26 @@ define void @v_shuffle_v3f32_v4f32__7_u_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_u_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v4f32__7_u_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -4878,14 +4861,14 @@ define void @v_shuffle_v3f32_v4f32__7_0_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, v5
-; GFX90A-NEXT: v_mov_b32_e32 v9, v0
-; GFX90A-NEXT: v_mov_b32_e32 v10, v2
-; GFX90A-NEXT: global_store_dwordx3 v6, v[8:10], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v5
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -4895,14 +4878,15 @@ define void @v_shuffle_v3f32_v4f32__7_0_4(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v9, v0
-; GFX942-NEXT: v_mov_b32_e32 v8, v5
-; GFX942-NEXT: v_mov_b32_e32 v10, v2
-; GFX942-NEXT: global_store_dwordx3 v6, v[8:10], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v4, v5
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -5097,29 +5081,28 @@ define void @v_shuffle_v3f32_v4f32__7_5_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_5_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v1
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v4f32__7_5_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: v_mov_b32_e32 v3, v1
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -5146,29 +5129,29 @@ define void @v_shuffle_v3f32_v4f32__7_6_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_6_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v2
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v4f32__7_6_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v2
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v3
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -5692,14 +5675,14 @@ define void @v_shuffle_v3f32_v4f32__7_0_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, v5
-; GFX90A-NEXT: v_mov_b32_e32 v9, v0
-; GFX90A-NEXT: v_mov_b32_e32 v10, v3
-; GFX90A-NEXT: global_store_dwordx3 v6, v[8:10], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v5
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -5709,14 +5692,15 @@ define void @v_shuffle_v3f32_v4f32__7_0_5(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v9, v0
-; GFX942-NEXT: v_mov_b32_e32 v8, v5
-; GFX942-NEXT: v_mov_b32_e32 v10, v3
-; GFX942-NEXT: global_store_dwordx3 v6, v[8:10], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v4, v5
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -5915,29 +5899,28 @@ define void @v_shuffle_v3f32_v4f32__7_4_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_4_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v1
-; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v1
+; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v4f32__7_4_5:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: v_mov_b32_e32 v8, v1
-; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, v1
+; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -5964,29 +5947,29 @@ define void @v_shuffle_v3f32_v4f32__7_6_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_6_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v2
-; GFX90A-NEXT: v_mov_b32_e32 v8, v1
-; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v6, v1
+; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v4f32__7_6_5:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v2
-; GFX942-NEXT: v_mov_b32_e32 v8, v1
-; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v3
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: v_mov_b32_e32 v6, v1
+; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -6723,29 +6706,29 @@ define void @v_shuffle_v3f32_v4f32__7_4_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_4_6:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v2
-; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v4f32__7_4_6:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: v_mov_b32_e32 v8, v2
-; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v3
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -7501,29 +7484,29 @@ define void @v_shuffle_v3f32_v4f32__7_4_7(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_4_7:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v3
-; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
+; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f32_v4f32__7_4_7:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v4, v3
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: v_mov_b32_e32 v8, v3
-; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
+; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v2i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v2i32.ll
index 91790ab5ff97f..ea4fac3b1d2b1 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v2i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v2i32.ll
@@ -170,15 +170,15 @@ define void @v_shuffle_v3i32_v2i32__3_0_u(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v1
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -186,15 +186,15 @@ define void @v_shuffle_v3i32_v2i32__3_0_u(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:3]
+; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:1]
+; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-NEXT: global_store_dwordx3 v4, v[2:4], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v1
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i32> asm "; def $0", "=v"()
@@ -273,27 +273,27 @@ define void @v_shuffle_v3i32_v2i32__3_2_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_2_u:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v1
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v2i32__3_2_u:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:1]
+; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v1
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i32> asm "; def $0", "=v"()
@@ -469,29 +469,27 @@ define void @v_shuffle_v3i32_v2i32__3_3_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_3_2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v1
-; GFX90A-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v2i32__3_3_2:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:1]
+; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v1
-; GFX942-NEXT: v_mov_b32_e32 v5, v1
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v1, v3
+; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i32> asm "; def $0", "=v"()
@@ -563,27 +561,26 @@ define void @v_shuffle_v3i32_v2i32__u_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v2i32__u_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: global_store_dwordx3 v2, v[2:4], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v0
+; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v2i32__u_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: global_store_dwordx3 v2, v[2:4], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v1, v0
+; GFX942-NEXT: v_mov_b32_e32 v2, v0
+; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i32> asm "; def $0", "=v"()
@@ -654,29 +651,27 @@ define void @v_shuffle_v3i32_v2i32__1_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v2i32__1_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v1
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v2i32__1_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:1]
+; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v1
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i32> asm "; def $0", "=v"()
@@ -701,27 +696,26 @@ define void @v_shuffle_v3i32_v2i32__2_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v2i32__2_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: global_store_dwordx3 v2, v[2:4], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v0
+; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v2i32__2_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: global_store_dwordx3 v2, v[2:4], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v1, v0
+; GFX942-NEXT: v_mov_b32_e32 v2, v0
+; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i32> asm "; def $0", "=v"()
@@ -750,35 +744,32 @@ define void @v_shuffle_v3i32_v2i32__3_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v1
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v2i32__3_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v1
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i32> asm "; def $0", "=v"()
@@ -808,33 +799,30 @@ define void @v_shuffle_v3i32_v2i32__3_u_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_u_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
+; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v2i32__3_u_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
+; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i32> asm "; def $0", "=v"()
@@ -864,35 +852,32 @@ define void @v_shuffle_v3i32_v2i32__3_1_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_1_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v1
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v1
+; GFX90A-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v2i32__3_1_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v1
+; GFX942-NEXT: v_mov_b32_e32 v1, v3
+; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i32> asm "; def $0", "=v"()
@@ -922,35 +907,33 @@ define void @v_shuffle_v3i32_v2i32__3_2_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_2_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v0, v5
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v2
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
+; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v2i32__3_2_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:1]
+; GFX942-NEXT: ; def v[4:5]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v2
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v5
+; GFX942-NEXT: v_mov_b32_e32 v1, v4
+; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i32> asm "; def $0", "=v"()
@@ -1262,35 +1245,34 @@ define void @v_shuffle_v3i32_v2i32__3_0_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_0_1:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v1
-; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, v1
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v2i32__3_0_1:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:1]
+; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:3]
+; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: v_mov_b32_e32 v8, v1
-; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v1
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i32> asm "; def $0", "=v"()
@@ -1320,35 +1302,34 @@ define void @v_shuffle_v3i32_v2i32__3_2_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_2_1:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v2
-; GFX90A-NEXT: v_mov_b32_e32 v8, v1
-; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v5
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v2i32__3_2_1:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:1]
+; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:3]
+; GFX942-NEXT: ; def v[4:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v2
-; GFX942-NEXT: v_mov_b32_e32 v8, v1
-; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: v_mov_b32_e32 v0, v5
+; GFX942-NEXT: v_mov_b32_e32 v1, v4
+; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i32> asm "; def $0", "=v"()
@@ -1478,29 +1459,27 @@ define void @v_shuffle_v3i32_v2i32__3_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_2_2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v1
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v2i32__3_2_2:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:1]
+; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v1
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i32> asm "; def $0", "=v"()
@@ -1527,27 +1506,25 @@ define void @v_shuffle_v3i32_v2i32__3_u_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_u_2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_mov_b32_e32 v1, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v1
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: global_store_dwordx3 v1, v[0:2], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v2i32__3_u_2:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b32_e32 v1, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:1]
+; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v1
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: global_store_dwordx3 v1, v[0:2], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i32> asm "; def $0", "=v"()
@@ -1576,35 +1553,34 @@ define void @v_shuffle_v3i32_v2i32__3_0_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_0_2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v2
-; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v5
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v2i32__3_0_2:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:1]
+; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:3]
+; GFX942-NEXT: ; def v[4:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: v_mov_b32_e32 v8, v2
-; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: v_mov_b32_e32 v0, v5
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i32> asm "; def $0", "=v"()
@@ -1880,27 +1856,26 @@ define void @v_shuffle_v3i32_v2i32__3_u_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_u_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v1
-; GFX90A-NEXT: v_mov_b32_e32 v6, v1
-; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, v1
+; GFX90A-NEXT: v_mov_b32_e32 v2, v1
+; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v2i32__3_u_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v1
-; GFX942-NEXT: v_mov_b32_e32 v6, v1
-; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v1
+; GFX942-NEXT: v_mov_b32_e32 v2, v1
+; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i32> asm "; def $0", "=v"()
@@ -1930,35 +1905,34 @@ define void @v_shuffle_v3i32_v2i32__3_0_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_0_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v3
-; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v5
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, v5
+; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v2i32__3_0_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:1]
+; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:3]
+; GFX942-NEXT: ; def v[4:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: v_mov_b32_e32 v8, v3
-; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: v_mov_b32_e32 v0, v5
+; GFX942-NEXT: v_mov_b32_e32 v2, v5
+; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i32> asm "; def $0", "=v"()
@@ -2039,29 +2013,28 @@ define void @v_shuffle_v3i32_v2i32__3_2_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_2_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v1
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v1
-; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v2i32__3_2_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:1]
+; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v1
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v1
-; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i32> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v3i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v3i32.ll
index db780ced25148..7061c13b28d03 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v3i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v3i32.ll
@@ -416,27 +416,25 @@ define void @v_shuffle_v3i32_v3i32__5_3_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_3_u:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v3i32__5_3_u:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: global_store_dwordx3 v4, v[2:4], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -961,29 +959,28 @@ define void @v_shuffle_v3i32_v3i32__1_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v3i32__1_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v1
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v1
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v3i32__1_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v1
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, v1
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -1010,29 +1007,27 @@ define void @v_shuffle_v3i32_v3i32__2_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v3i32__2_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v3i32__2_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -1400,14 +1395,13 @@ define void @v_shuffle_v3i32_v3i32__5_3_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v2
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1417,14 +1411,13 @@ define void @v_shuffle_v3i32_v3i32__5_3_0(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v7, v2
-; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -2015,14 +2008,13 @@ define void @v_shuffle_v3i32_v3i32__5_3_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v2
-; GFX90A-NEXT: v_mov_b32_e32 v8, v1
-; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v6, v1
+; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2032,14 +2024,13 @@ define void @v_shuffle_v3i32_v3i32__5_3_1(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, v1
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v7, v2
-; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, v1
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -2868,29 +2859,28 @@ define void @v_shuffle_v3i32_v3i32__4_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v3i32__4_3_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v1
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v1
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v3i32__4_3_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v1
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, v1
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -2918,29 +2908,27 @@ define void @v_shuffle_v3i32_v3i32__5_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_3_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v3i32__5_3_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -3019,14 +3007,13 @@ define void @v_shuffle_v3i32_v3i32__5_0_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v2
-; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -3036,14 +3023,13 @@ define void @v_shuffle_v3i32_v3i32__5_0_3(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v8, v2
-; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -3181,29 +3167,27 @@ define void @v_shuffle_v3i32_v3i32__5_4_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_4_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v3i32__5_4_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v1
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v1
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -3626,14 +3610,13 @@ define void @v_shuffle_v3i32_v3i32__5_0_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v3
-; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -3643,14 +3626,13 @@ define void @v_shuffle_v3i32_v3i32__5_0_4(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v8, v3
-; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -3791,29 +3773,27 @@ define void @v_shuffle_v3i32_v3i32__5_3_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_3_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v1
-; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v1
+; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v3i32__5_3_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v1
-; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, v1
+; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v4i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v4i32.ll
index 92d6c95c26599..11d1897d0449f 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v4i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v4i32.ll
@@ -965,29 +965,26 @@ define void @v_shuffle_v3i32_v4i32__7_7_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_7_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v4i32__7_7_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -1014,29 +1011,26 @@ define void @v_shuffle_v3i32_v4i32__7_7_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_7_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v1
-; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v4, v1
+; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v4i32__7_7_5:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_mov_b32_e32 v8, v1
-; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: v_mov_b32_e32 v4, v1
+; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -1247,29 +1241,28 @@ define void @v_shuffle_v3i32_v4i32__1_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v4i32__1_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v1
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v1
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v4i32__1_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v1
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, v1
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -1296,29 +1289,26 @@ define void @v_shuffle_v3i32_v4i32__2_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v4i32__2_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v4i32__2_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -1345,29 +1335,28 @@ define void @v_shuffle_v3i32_v4i32__3_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v4i32__3_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v4i32__3_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -1849,14 +1838,14 @@ define void @v_shuffle_v3i32_v4i32__7_4_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, v5
-; GFX90A-NEXT: v_mov_b32_e32 v9, v2
-; GFX90A-NEXT: v_mov_b32_e32 v10, v0
-; GFX90A-NEXT: global_store_dwordx3 v6, v[8:10], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v5
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1866,14 +1855,14 @@ define void @v_shuffle_v3i32_v4i32__7_4_0(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v10, v0
-; GFX942-NEXT: v_mov_b32_e32 v8, v5
-; GFX942-NEXT: v_mov_b32_e32 v9, v2
-; GFX942-NEXT: global_store_dwordx3 v6, v[8:10], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, v5
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -2678,14 +2667,14 @@ define void @v_shuffle_v3i32_v4i32__7_4_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, v5
-; GFX90A-NEXT: v_mov_b32_e32 v9, v2
-; GFX90A-NEXT: v_mov_b32_e32 v10, v1
-; GFX90A-NEXT: global_store_dwordx3 v6, v[8:10], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v5
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v6, v1
+; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2695,14 +2684,14 @@ define void @v_shuffle_v3i32_v4i32__7_4_1(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v10, v1
-; GFX942-NEXT: v_mov_b32_e32 v8, v5
-; GFX942-NEXT: v_mov_b32_e32 v9, v2
-; GFX942-NEXT: global_store_dwordx3 v6, v[8:10], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, v1
+; GFX942-NEXT: v_mov_b32_e32 v4, v5
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -4675,29 +4664,28 @@ define void @v_shuffle_v3i32_v4i32__5_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v4i32__5_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v1
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v1
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v4i32__5_4_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v1
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, v1
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -4725,29 +4713,26 @@ define void @v_shuffle_v3i32_v4i32__6_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v4i32__6_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v4i32__6_4_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -4775,29 +4760,28 @@ define void @v_shuffle_v3i32_v4i32__7_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v4i32__7_4_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -4824,27 +4808,26 @@ define void @v_shuffle_v3i32_v4i32__7_u_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_u_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v4i32__7_u_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -4878,14 +4861,14 @@ define void @v_shuffle_v3i32_v4i32__7_0_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, v5
-; GFX90A-NEXT: v_mov_b32_e32 v9, v0
-; GFX90A-NEXT: v_mov_b32_e32 v10, v2
-; GFX90A-NEXT: global_store_dwordx3 v6, v[8:10], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v5
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -4895,14 +4878,15 @@ define void @v_shuffle_v3i32_v4i32__7_0_4(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v9, v0
-; GFX942-NEXT: v_mov_b32_e32 v8, v5
-; GFX942-NEXT: v_mov_b32_e32 v10, v2
-; GFX942-NEXT: global_store_dwordx3 v6, v[8:10], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v4, v5
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -5097,29 +5081,28 @@ define void @v_shuffle_v3i32_v4i32__7_5_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_5_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v1
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v4i32__7_5_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: v_mov_b32_e32 v3, v1
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -5146,29 +5129,29 @@ define void @v_shuffle_v3i32_v4i32__7_6_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_6_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v2
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v4i32__7_6_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v2
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v3
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -5692,14 +5675,14 @@ define void @v_shuffle_v3i32_v4i32__7_0_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, v5
-; GFX90A-NEXT: v_mov_b32_e32 v9, v0
-; GFX90A-NEXT: v_mov_b32_e32 v10, v3
-; GFX90A-NEXT: global_store_dwordx3 v6, v[8:10], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v5
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -5709,14 +5692,15 @@ define void @v_shuffle_v3i32_v4i32__7_0_5(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v9, v0
-; GFX942-NEXT: v_mov_b32_e32 v8, v5
-; GFX942-NEXT: v_mov_b32_e32 v10, v3
-; GFX942-NEXT: global_store_dwordx3 v6, v[8:10], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v4, v5
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -5915,29 +5899,28 @@ define void @v_shuffle_v3i32_v4i32__7_4_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_4_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v1
-; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v1
+; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v4i32__7_4_5:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: v_mov_b32_e32 v8, v1
-; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, v1
+; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -5964,29 +5947,29 @@ define void @v_shuffle_v3i32_v4i32__7_6_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_6_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v2
-; GFX90A-NEXT: v_mov_b32_e32 v8, v1
-; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v6, v1
+; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v4i32__7_6_5:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v2
-; GFX942-NEXT: v_mov_b32_e32 v8, v1
-; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v3
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: v_mov_b32_e32 v6, v1
+; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -6723,29 +6706,29 @@ define void @v_shuffle_v3i32_v4i32__7_4_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_4_6:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v2
-; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v4i32__7_4_6:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: v_mov_b32_e32 v8, v2
-; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v3
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -7501,29 +7484,29 @@ define void @v_shuffle_v3i32_v4i32__7_4_7(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_4_7:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v3
-; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
+; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i32_v4i32__7_4_7:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v4, v3
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: v_mov_b32_e32 v8, v3
-; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
+; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v2i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v2i64.ll
index bbca5039bb02c..a15fc3212f474 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v2i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v2i64.ll
@@ -291,31 +291,27 @@ define void @v_shuffle_v3i64_v2i64__3_2_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i64_v2i64__3_2_u:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: v_mov_b32_e32 v5, v1
+; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i64_v2i64__3_2_u:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: v_mov_b32_e32 v5, v1
+; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i64> asm "; def $0", "=v"()
@@ -699,32 +695,28 @@ define void @v_shuffle_v3i64_v2i64__1_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i64_v2i64__1_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
-; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: v_mov_b32_e32 v5, v1
+; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i64_v2i64__1_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:16
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
+; GFX942-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: v_mov_b32_e32 v5, v1
+; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i64> asm "; def $0", "=v"()
@@ -1563,32 +1555,28 @@ define void @v_shuffle_v3i64_v2i64__3_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i64_v2i64__3_2_2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
-; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: v_mov_b32_e32 v5, v1
+; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i64_v2i64__3_2_2:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:16
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
+; GFX942-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: v_mov_b32_e32 v5, v1
+; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i64> asm "; def $0", "=v"()
@@ -2160,32 +2148,28 @@ define void @v_shuffle_v3i64_v2i64__3_2_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3i64_v2i64__3_2_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
-; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: v_mov_b32_e32 v5, v1
+; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3i64_v2i64__3_2_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] offset:16
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
+; GFX942-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] offset:16
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: v_mov_b32_e32 v5, v1
+; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i64> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v2p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v2p0.ll
index 8757639c501d2..fe132493ce536 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v2p0.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v2p0.ll
@@ -291,31 +291,27 @@ define void @v_shuffle_v3p0_v2p0__3_2_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p0_v2p0__3_2_u:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: v_mov_b32_e32 v5, v1
+; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p0_v2p0__3_2_u:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: v_mov_b32_e32 v5, v1
+; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr> asm "; def $0", "=v"()
@@ -699,32 +695,28 @@ define void @v_shuffle_v3p0_v2p0__1_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p0_v2p0__1_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
-; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: v_mov_b32_e32 v5, v1
+; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p0_v2p0__1_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:16
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
+; GFX942-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: v_mov_b32_e32 v5, v1
+; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr> asm "; def $0", "=v"()
@@ -1563,32 +1555,28 @@ define void @v_shuffle_v3p0_v2p0__3_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p0_v2p0__3_2_2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
-; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: v_mov_b32_e32 v5, v1
+; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p0_v2p0__3_2_2:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:16
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
+; GFX942-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: v_mov_b32_e32 v5, v1
+; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr> asm "; def $0", "=v"()
@@ -2160,32 +2148,28 @@ define void @v_shuffle_v3p0_v2p0__3_2_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p0_v2p0__3_2_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
-; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: v_mov_b32_e32 v5, v1
+; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p0_v2p0__3_2_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] offset:16
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
+; GFX942-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] offset:16
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: v_mov_b32_e32 v5, v1
+; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v2p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v2p3.ll
index 6d294b58ceeec..bd0100a4ffdb5 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v2p3.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v2p3.ll
@@ -170,15 +170,15 @@ define void @v_shuffle_v3p3_v2p3__3_0_u(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v1
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -186,15 +186,15 @@ define void @v_shuffle_v3p3_v2p3__3_0_u(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:3]
+; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:1]
+; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-NEXT: global_store_dwordx3 v4, v[2:4], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v1
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -273,27 +273,27 @@ define void @v_shuffle_v3p3_v2p3__3_2_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_2_u:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v1
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v2p3__3_2_u:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:1]
+; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v1
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -469,29 +469,27 @@ define void @v_shuffle_v3p3_v2p3__3_3_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_3_2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v1
-; GFX90A-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v2p3__3_3_2:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:1]
+; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v1
-; GFX942-NEXT: v_mov_b32_e32 v5, v1
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v1, v3
+; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -563,27 +561,26 @@ define void @v_shuffle_v3p3_v2p3__u_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v2p3__u_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: global_store_dwordx3 v2, v[2:4], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v0
+; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v2p3__u_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: global_store_dwordx3 v2, v[2:4], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v1, v0
+; GFX942-NEXT: v_mov_b32_e32 v2, v0
+; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -654,29 +651,27 @@ define void @v_shuffle_v3p3_v2p3__1_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v2p3__1_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v1
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v2p3__1_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:1]
+; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v1
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -701,27 +696,26 @@ define void @v_shuffle_v3p3_v2p3__2_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v2p3__2_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: global_store_dwordx3 v2, v[2:4], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v0
+; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v2p3__2_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: global_store_dwordx3 v2, v[2:4], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v1, v0
+; GFX942-NEXT: v_mov_b32_e32 v2, v0
+; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -750,35 +744,32 @@ define void @v_shuffle_v3p3_v2p3__3_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v1
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v2p3__3_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v1
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -808,33 +799,30 @@ define void @v_shuffle_v3p3_v2p3__3_u_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_u_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, v1
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
+; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v2p3__3_u_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v1
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
+; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -864,35 +852,32 @@ define void @v_shuffle_v3p3_v2p3__3_1_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_1_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v1
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v1
+; GFX90A-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v2p3__3_1_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v1
+; GFX942-NEXT: v_mov_b32_e32 v1, v3
+; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -922,35 +907,33 @@ define void @v_shuffle_v3p3_v2p3__3_2_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_2_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v0, v5
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v2
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
+; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v2p3__3_2_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:1]
+; GFX942-NEXT: ; def v[4:5]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v2
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v5
+; GFX942-NEXT: v_mov_b32_e32 v1, v4
+; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -1262,35 +1245,34 @@ define void @v_shuffle_v3p3_v2p3__3_0_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_0_1:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v1
-; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, v1
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v2p3__3_0_1:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:1]
+; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:3]
+; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: v_mov_b32_e32 v8, v1
-; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v1
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -1320,35 +1302,34 @@ define void @v_shuffle_v3p3_v2p3__3_2_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_2_1:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v2
-; GFX90A-NEXT: v_mov_b32_e32 v8, v1
-; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v5
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v2p3__3_2_1:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:1]
+; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:3]
+; GFX942-NEXT: ; def v[4:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v2
-; GFX942-NEXT: v_mov_b32_e32 v8, v1
-; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: v_mov_b32_e32 v0, v5
+; GFX942-NEXT: v_mov_b32_e32 v1, v4
+; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -1478,29 +1459,27 @@ define void @v_shuffle_v3p3_v2p3__3_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_2_2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v1
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v2p3__3_2_2:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:1]
+; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v1
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -1527,27 +1506,25 @@ define void @v_shuffle_v3p3_v2p3__3_u_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_u_2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_mov_b32_e32 v1, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v1
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: global_store_dwordx3 v1, v[0:2], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v2p3__3_u_2:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b32_e32 v1, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:1]
+; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v1
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: global_store_dwordx3 v1, v[0:2], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -1576,35 +1553,34 @@ define void @v_shuffle_v3p3_v2p3__3_0_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_0_2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v2
-; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v5
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v2p3__3_0_2:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:1]
+; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:3]
+; GFX942-NEXT: ; def v[4:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: v_mov_b32_e32 v8, v2
-; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: v_mov_b32_e32 v0, v5
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -1880,27 +1856,26 @@ define void @v_shuffle_v3p3_v2p3__3_u_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_u_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v1
-; GFX90A-NEXT: v_mov_b32_e32 v6, v1
-; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, v1
+; GFX90A-NEXT: v_mov_b32_e32 v2, v1
+; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v2p3__3_u_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v1
-; GFX942-NEXT: v_mov_b32_e32 v6, v1
-; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v1
+; GFX942-NEXT: v_mov_b32_e32 v2, v1
+; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -1930,35 +1905,34 @@ define void @v_shuffle_v3p3_v2p3__3_0_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_0_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v3
-; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v5
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, v5
+; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v2p3__3_0_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:1]
+; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:3]
+; GFX942-NEXT: ; def v[4:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: v_mov_b32_e32 v8, v3
-; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: v_mov_b32_e32 v0, v5
+; GFX942-NEXT: v_mov_b32_e32 v2, v5
+; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -2039,29 +2013,28 @@ define void @v_shuffle_v3p3_v2p3__3_2_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_2_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v1
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v1
-; GFX90A-NEXT: global_store_dwordx3 v2, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v2p3__3_2_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:1]
+; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v1
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v1
-; GFX942-NEXT: global_store_dwordx3 v2, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v3p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v3p3.ll
index 88d43df5938ee..cecd2a0e4b015 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v3p3.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v3p3.ll
@@ -416,27 +416,25 @@ define void @v_shuffle_v3p3_v3p3__5_3_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_3_u:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v3p3__5_3_u:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: global_store_dwordx3 v4, v[2:4], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -961,29 +959,28 @@ define void @v_shuffle_v3p3_v3p3__1_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v3p3__1_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v1
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v1
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v3p3__1_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v1
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, v1
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -1010,29 +1007,27 @@ define void @v_shuffle_v3p3_v3p3__2_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v3p3__2_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v3p3__2_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -1400,14 +1395,13 @@ define void @v_shuffle_v3p3_v3p3__5_3_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v2
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1417,14 +1411,13 @@ define void @v_shuffle_v3p3_v3p3__5_3_0(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v7, v2
-; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -2015,14 +2008,13 @@ define void @v_shuffle_v3p3_v3p3__5_3_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v2
-; GFX90A-NEXT: v_mov_b32_e32 v8, v1
-; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v6, v1
+; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2032,14 +2024,13 @@ define void @v_shuffle_v3p3_v3p3__5_3_1(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, v1
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v7, v2
-; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, v1
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -2868,29 +2859,28 @@ define void @v_shuffle_v3p3_v3p3__4_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v3p3__4_3_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v1
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v1
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v3p3__4_3_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v1
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, v1
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -2918,29 +2908,27 @@ define void @v_shuffle_v3p3_v3p3__5_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_3_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v3p3__5_3_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -3019,14 +3007,13 @@ define void @v_shuffle_v3p3_v3p3__5_0_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v2
-; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -3036,14 +3023,13 @@ define void @v_shuffle_v3p3_v3p3__5_0_3(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v8, v2
-; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -3181,29 +3167,27 @@ define void @v_shuffle_v3p3_v3p3__5_4_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_4_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v3p3__5_4_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v1
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v1
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -3626,14 +3610,13 @@ define void @v_shuffle_v3p3_v3p3__5_0_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v3
-; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -3643,14 +3626,13 @@ define void @v_shuffle_v3p3_v3p3__5_0_4(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v8, v3
-; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -3791,29 +3773,27 @@ define void @v_shuffle_v3p3_v3p3__5_3_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_3_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v1
-; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v1
+; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v3p3__5_3_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v1
-; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, v1
+; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v4p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v4p3.ll
index c9f194d873e35..834f03f013ba1 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v4p3.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v4p3.ll
@@ -965,29 +965,26 @@ define void @v_shuffle_v3p3_v4p3__7_7_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_7_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v4p3__7_7_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -1014,29 +1011,26 @@ define void @v_shuffle_v3p3_v4p3__7_7_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_7_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v1
-; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v4, v1
+; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v4p3__7_7_5:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_mov_b32_e32 v8, v1
-; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: v_mov_b32_e32 v4, v1
+; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -1247,29 +1241,28 @@ define void @v_shuffle_v3p3_v4p3__1_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v4p3__1_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v1
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v1
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v4p3__1_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v1
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, v1
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -1296,29 +1289,26 @@ define void @v_shuffle_v3p3_v4p3__2_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v4p3__2_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v4p3__2_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -1345,29 +1335,28 @@ define void @v_shuffle_v3p3_v4p3__3_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v4p3__3_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v4p3__3_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -1849,14 +1838,14 @@ define void @v_shuffle_v3p3_v4p3__7_4_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, v5
-; GFX90A-NEXT: v_mov_b32_e32 v9, v2
-; GFX90A-NEXT: v_mov_b32_e32 v10, v0
-; GFX90A-NEXT: global_store_dwordx3 v6, v[8:10], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v5
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1866,14 +1855,14 @@ define void @v_shuffle_v3p3_v4p3__7_4_0(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v10, v0
-; GFX942-NEXT: v_mov_b32_e32 v8, v5
-; GFX942-NEXT: v_mov_b32_e32 v9, v2
-; GFX942-NEXT: global_store_dwordx3 v6, v[8:10], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, v5
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -2678,14 +2667,14 @@ define void @v_shuffle_v3p3_v4p3__7_4_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, v5
-; GFX90A-NEXT: v_mov_b32_e32 v9, v2
-; GFX90A-NEXT: v_mov_b32_e32 v10, v1
-; GFX90A-NEXT: global_store_dwordx3 v6, v[8:10], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v5
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v6, v1
+; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2695,14 +2684,14 @@ define void @v_shuffle_v3p3_v4p3__7_4_1(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v10, v1
-; GFX942-NEXT: v_mov_b32_e32 v8, v5
-; GFX942-NEXT: v_mov_b32_e32 v9, v2
-; GFX942-NEXT: global_store_dwordx3 v6, v[8:10], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, v1
+; GFX942-NEXT: v_mov_b32_e32 v4, v5
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -4675,29 +4664,28 @@ define void @v_shuffle_v3p3_v4p3__5_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v4p3__5_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v1
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v1
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v4p3__5_4_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v1
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, v1
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -4725,29 +4713,26 @@ define void @v_shuffle_v3p3_v4p3__6_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v4p3__6_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v4p3__6_4_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -4775,29 +4760,28 @@ define void @v_shuffle_v3p3_v4p3__7_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v4p3__7_4_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -4824,27 +4808,26 @@ define void @v_shuffle_v3p3_v4p3__7_u_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_u_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v4p3__7_u_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -4878,14 +4861,14 @@ define void @v_shuffle_v3p3_v4p3__7_0_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, v5
-; GFX90A-NEXT: v_mov_b32_e32 v9, v0
-; GFX90A-NEXT: v_mov_b32_e32 v10, v2
-; GFX90A-NEXT: global_store_dwordx3 v6, v[8:10], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v5
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -4895,14 +4878,15 @@ define void @v_shuffle_v3p3_v4p3__7_0_4(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v9, v0
-; GFX942-NEXT: v_mov_b32_e32 v8, v5
-; GFX942-NEXT: v_mov_b32_e32 v10, v2
-; GFX942-NEXT: global_store_dwordx3 v6, v[8:10], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v4, v5
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -5097,29 +5081,28 @@ define void @v_shuffle_v3p3_v4p3__7_5_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_5_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v1
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v4p3__7_5_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: v_mov_b32_e32 v3, v1
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -5146,29 +5129,29 @@ define void @v_shuffle_v3p3_v4p3__7_6_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_6_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v2
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v4p3__7_6_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v2
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v3
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -5692,14 +5675,14 @@ define void @v_shuffle_v3p3_v4p3__7_0_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, v5
-; GFX90A-NEXT: v_mov_b32_e32 v9, v0
-; GFX90A-NEXT: v_mov_b32_e32 v10, v3
-; GFX90A-NEXT: global_store_dwordx3 v6, v[8:10], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v5
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -5709,14 +5692,15 @@ define void @v_shuffle_v3p3_v4p3__7_0_5(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v9, v0
-; GFX942-NEXT: v_mov_b32_e32 v8, v5
-; GFX942-NEXT: v_mov_b32_e32 v10, v3
-; GFX942-NEXT: global_store_dwordx3 v6, v[8:10], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v4, v5
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -5915,29 +5899,28 @@ define void @v_shuffle_v3p3_v4p3__7_4_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_4_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v1
-; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v1
+; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v4p3__7_4_5:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: v_mov_b32_e32 v8, v1
-; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, v1
+; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -5964,29 +5947,29 @@ define void @v_shuffle_v3p3_v4p3__7_6_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_6_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v2
-; GFX90A-NEXT: v_mov_b32_e32 v8, v1
-; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v6, v1
+; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v4p3__7_6_5:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v2
-; GFX942-NEXT: v_mov_b32_e32 v8, v1
-; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v3
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: v_mov_b32_e32 v6, v1
+; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -6723,29 +6706,29 @@ define void @v_shuffle_v3p3_v4p3__7_4_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_4_6:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v2
-; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v4p3__7_4_6:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: v_mov_b32_e32 v8, v2
-; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v3
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -7501,29 +7484,29 @@ define void @v_shuffle_v3p3_v4p3__7_4_7(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_4_7:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v3
-; GFX90A-NEXT: global_store_dwordx3 v4, v[6:8], s[16:17]
+; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3p3_v4p3__7_4_7:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v4, v3
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: v_mov_b32_e32 v8, v3
-; GFX942-NEXT: global_store_dwordx3 v4, v[6:8], s[0:1]
+; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v2f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v2f32.ll
index c7092f04a23ed..df148f299a165 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v2f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v2f32.ll
@@ -272,27 +272,27 @@ define void @v_shuffle_v4f32_v2f32__3_2_u_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v2f32__3_2_u_u:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v1
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: global_store_dwordx4 v2, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v2f32__3_2_u_u:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:1]
+; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v1
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: global_store_dwordx4 v2, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=v"()
@@ -2380,29 +2380,28 @@ define void @v_shuffle_v4f32_v2f32__3_3_u_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v2f32__3_3_u_2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v1
-; GFX90A-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: global_store_dwordx4 v2, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v2f32__3_3_u_2:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:1]
+; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v1
-; GFX942-NEXT: v_mov_b32_e32 v5, v1
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: global_store_dwordx4 v2, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v1, v3
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
+; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v3f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v3f32.ll
index 1224ab2b381c9..d4ee6fa20cad8 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v3f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v3f32.ll
@@ -255,15 +255,15 @@ define void @v_shuffle_v4f32_v3f32__5_0_u_u(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -271,16 +271,15 @@ define void @v_shuffle_v4f32_v3f32__5_0_u_u(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -414,27 +413,27 @@ define void @v_shuffle_v4f32_v3f32__5_3_u_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_3_u_u:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_3_u_u:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v4
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -554,16 +553,15 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_u(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -571,17 +569,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_u(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v1, v6
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -612,16 +609,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_u(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: v_mov_b32_e32 v4, v1
-; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -629,17 +626,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_u(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
-; GFX942-NEXT: v_mov_b32_e32 v4, v1
-; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -722,29 +719,27 @@ define void @v_shuffle_v4f32_v3f32__5_5_3_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_3_u:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_3_u:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v2
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v4
+; GFX942-NEXT: v_mov_b32_e32 v1, v4
+; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -771,29 +766,28 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_4_u:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_mov_b32_e32 v6, v1
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_4_u:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v2
-; GFX942-NEXT: v_mov_b32_e32 v6, v1
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v4
+; GFX942-NEXT: v_mov_b32_e32 v1, v4
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -1037,31 +1031,31 @@ define void @v_shuffle_v4f32_v3f32__5_5_5_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_5_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_mov_b32_e32 v2, v6
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_5_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v2
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v1, v6
+; GFX942-NEXT: v_mov_b32_e32 v2, v6
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -1089,31 +1083,28 @@ define void @v_shuffle_v4f32_v3f32__5_5_5_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_5_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v1
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_5_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v2
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v4
+; GFX942-NEXT: v_mov_b32_e32 v1, v4
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -1189,29 +1180,28 @@ define void @v_shuffle_v4f32_v3f32__u_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v3f32__u_0_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v0
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v3f32__u_0_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v1, v0
+; GFX942-NEXT: v_mov_b32_e32 v2, v0
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -1335,31 +1325,31 @@ define void @v_shuffle_v4f32_v3f32__2_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v3f32__2_0_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v3f32__2_0_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v1, v4
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -1385,29 +1375,28 @@ define void @v_shuffle_v4f32_v3f32__3_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v3f32__3_0_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v0
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v3f32__3_0_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v1, v0
+; GFX942-NEXT: v_mov_b32_e32 v2, v0
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -1499,15 +1488,15 @@ define void @v_shuffle_v4f32_v3f32__5_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v0
-; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1517,15 +1506,15 @@ define void @v_shuffle_v4f32_v3f32__5_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, v0
-; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
+; GFX942-NEXT: v_mov_b32_e32 v1, v4
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -1556,33 +1545,34 @@ define void @v_shuffle_v4f32_v3f32__5_u_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_u_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: global_store_dwordx4 v5, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_u_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: global_store_dwordx4 v5, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -1616,15 +1606,15 @@ define void @v_shuffle_v4f32_v3f32__5_1_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v1
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v0
-; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
+; GFX90A-NEXT: v_mov_b32_e32 v1, v5
+; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1634,15 +1624,15 @@ define void @v_shuffle_v4f32_v3f32__5_1_0_0(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, v0
-; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
+; GFX942-NEXT: v_mov_b32_e32 v1, v5
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -1674,17 +1664,17 @@ define void @v_shuffle_v4f32_v3f32__5_2_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v6
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
+; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1692,17 +1682,17 @@ define void @v_shuffle_v4f32_v3f32__5_2_0_0(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v4, v6
-; GFX942-NEXT: v_mov_b32_e32 v5, v2
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
+; GFX942-NEXT: v_mov_b32_e32 v1, v6
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -1734,17 +1724,17 @@ define void @v_shuffle_v4f32_v3f32__5_3_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v9, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[6:8]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v2
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v0
-; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v8
+; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v9, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1752,17 +1742,17 @@ define void @v_shuffle_v4f32_v3f32__5_3_0_0(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v9, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[6:8]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v7, v2
-; GFX942-NEXT: v_mov_b32_e32 v9, v0
-; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: v_mov_b32_e32 v0, v8
+; GFX942-NEXT: v_mov_b32_e32 v1, v6
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -1853,15 +1843,15 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v0
-; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1871,15 +1861,15 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_0(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v7, v4
-; GFX942-NEXT: v_mov_b32_e32 v9, v0
-; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -1911,16 +1901,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_u_0(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
-; GFX90A-NEXT: v_mov_b32_e32 v9, v0
-; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1928,16 +1918,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_u_0(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v9, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v7, v4
-; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v1, v6
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -2028,17 +2018,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_2_0(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v6
-; GFX90A-NEXT: v_mov_b32_e32 v5, v6
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, v6
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2046,17 +2036,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_2_0(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v4, v6
-; GFX942-NEXT: v_mov_b32_e32 v5, v6
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: v_mov_b32_e32 v2, v6
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -2088,17 +2078,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_3_0(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v9, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[6:8]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
-; GFX90A-NEXT: v_mov_b32_e32 v8, v2
-; GFX90A-NEXT: v_mov_b32_e32 v9, v0
-; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v8
+; GFX90A-NEXT: v_mov_b32_e32 v1, v8
+; GFX90A-NEXT: v_mov_b32_e32 v2, v6
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v9, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2106,17 +2096,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_3_0(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v9, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[6:8]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v9, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v7, v4
-; GFX942-NEXT: v_mov_b32_e32 v8, v2
-; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: v_mov_b32_e32 v0, v8
+; GFX942-NEXT: v_mov_b32_e32 v1, v8
+; GFX942-NEXT: v_mov_b32_e32 v2, v6
+; GFX942-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -2613,17 +2603,16 @@ define void @v_shuffle_v4f32_v3f32__5_0_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v1
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
-; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2631,17 +2620,16 @@ define void @v_shuffle_v4f32_v3f32__5_0_1_1(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v8, v1
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
-; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -2673,17 +2661,16 @@ define void @v_shuffle_v4f32_v3f32__5_2_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v6
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_mov_b32_e32 v6, v1
-; GFX90A-NEXT: v_mov_b32_e32 v7, v1
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2691,17 +2678,17 @@ define void @v_shuffle_v4f32_v3f32__5_2_1_1(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v4, v6
-; GFX942-NEXT: v_mov_b32_e32 v5, v2
-; GFX942-NEXT: v_mov_b32_e32 v6, v1
-; GFX942-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
+; GFX942-NEXT: v_mov_b32_e32 v1, v4
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -2733,17 +2720,16 @@ define void @v_shuffle_v4f32_v3f32__5_3_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v2
-; GFX90A-NEXT: v_mov_b32_e32 v8, v1
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
-; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2751,17 +2737,16 @@ define void @v_shuffle_v4f32_v3f32__5_3_1_1(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, v1
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v7, v2
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
-; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v1, v4
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -2849,17 +2834,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
-; GFX90A-NEXT: v_mov_b32_e32 v8, v1
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
-; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2867,17 +2851,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_1(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, v1
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v7, v4
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
-; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -2909,16 +2893,15 @@ define void @v_shuffle_v4f32_v3f32__5_5_u_1(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
-; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2926,16 +2909,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_u_1(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v7, v4
-; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -2966,17 +2949,15 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_1(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
-; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2984,17 +2965,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_1(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v7, v4
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
-; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v1, v6
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -3026,17 +3006,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_2_1(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v6
-; GFX90A-NEXT: v_mov_b32_e32 v5, v6
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v1
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -3044,17 +3023,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_2_1(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v4, v6
-; GFX942-NEXT: v_mov_b32_e32 v5, v6
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -3086,17 +3065,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_3_1(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
-; GFX90A-NEXT: v_mov_b32_e32 v8, v2
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
-; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -3104,17 +3082,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_3_1(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v7, v4
-; GFX942-NEXT: v_mov_b32_e32 v8, v2
-; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v1, v6
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -3146,17 +3124,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_1(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
-; GFX90A-NEXT: v_mov_b32_e32 v8, v3
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
-; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_mov_b32_e32 v2, v5
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -3164,17 +3141,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_1(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v7, v4
-; GFX942-NEXT: v_mov_b32_e32 v8, v3
-; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v1, v6
+; GFX942-NEXT: v_mov_b32_e32 v2, v5
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -3607,17 +3584,17 @@ define void @v_shuffle_v4f32_v3f32__5_0_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[6:8]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v6
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v2
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v8
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -3625,17 +3602,17 @@ define void @v_shuffle_v4f32_v3f32__5_0_2_2(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[6:8]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v4, v6
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v2
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: v_mov_b32_e32 v0, v8
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -3894,16 +3871,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_u_2(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v6
-; GFX90A-NEXT: v_mov_b32_e32 v5, v6
-; GFX90A-NEXT: v_mov_b32_e32 v7, v2
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -3911,16 +3888,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_u_2(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v4, v6
-; GFX942-NEXT: v_mov_b32_e32 v5, v6
-; GFX942-NEXT: v_mov_b32_e32 v7, v2
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v1, v6
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -3952,17 +3929,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_2(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[6:8]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v6
-; GFX90A-NEXT: v_mov_b32_e32 v5, v6
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: v_mov_b32_e32 v7, v2
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v8
+; GFX90A-NEXT: v_mov_b32_e32 v1, v8
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -3970,17 +3946,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_2(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[6:8]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v4, v6
-; GFX942-NEXT: v_mov_b32_e32 v5, v6
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: v_mov_b32_e32 v7, v2
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: v_mov_b32_e32 v0, v8
+; GFX942-NEXT: v_mov_b32_e32 v1, v8
+; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -4070,37 +4045,36 @@ define void @v_shuffle_v4f32_v3f32__5_5_3_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_3_2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[6:8]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, v6
-; GFX90A-NEXT: v_mov_b32_e32 v9, v6
-; GFX90A-NEXT: v_mov_b32_e32 v10, v4
-; GFX90A-NEXT: v_mov_b32_e32 v11, v2
-; GFX90A-NEXT: global_store_dwordx4 v3, v[8:11], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v8
+; GFX90A-NEXT: v_mov_b32_e32 v1, v8
+; GFX90A-NEXT: v_mov_b32_e32 v2, v6
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_3_2:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[6:8]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v8, v6
-; GFX942-NEXT: v_mov_b32_e32 v9, v6
-; GFX942-NEXT: v_mov_b32_e32 v10, v4
-; GFX942-NEXT: v_mov_b32_e32 v11, v2
-; GFX942-NEXT: global_store_dwordx4 v3, v[8:11], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: v_mov_b32_e32 v0, v8
+; GFX942-NEXT: v_mov_b32_e32 v1, v8
+; GFX942-NEXT: v_mov_b32_e32 v2, v6
+; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -4384,31 +4358,31 @@ define void @v_shuffle_v4f32_v3f32__5_3_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_3_3_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_3_3_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v1, v4
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -4436,27 +4410,29 @@ define void @v_shuffle_v4f32_v3f32__5_u_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_u_3_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v1, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: global_store_dwordx4 v3, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v1, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_u_3_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v1, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: global_store_dwordx4 v3, v[2:5], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -4487,17 +4463,17 @@ define void @v_shuffle_v4f32_v3f32__5_0_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v2
-; GFX90A-NEXT: v_mov_b32_e32 v9, v2
-; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -4505,17 +4481,17 @@ define void @v_shuffle_v4f32_v3f32__5_0_3_3(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v8, v2
-; GFX942-NEXT: v_mov_b32_e32 v9, v2
-; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -4604,35 +4580,36 @@ define void @v_shuffle_v4f32_v3f32__5_2_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_2_3_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, v2
-; GFX90A-NEXT: v_mov_b32_e32 v8, v4
-; GFX90A-NEXT: v_mov_b32_e32 v9, v4
-; GFX90A-NEXT: global_store_dwordx4 v3, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_2_3_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v7, v2
-; GFX942-NEXT: v_mov_b32_e32 v8, v4
-; GFX942-NEXT: v_mov_b32_e32 v9, v4
-; GFX942-NEXT: global_store_dwordx4 v3, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -4660,31 +4637,31 @@ define void @v_shuffle_v4f32_v3f32__5_4_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_4_3_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v5
+; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_4_3_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v1
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v1, v5
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -4712,31 +4689,31 @@ define void @v_shuffle_v4f32_v3f32__5_5_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_3_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_3_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v2
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v1, v6
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -4764,29 +4741,28 @@ define void @v_shuffle_v4f32_v3f32__5_5_u_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_u_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_u_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v4
+; GFX942-NEXT: v_mov_b32_e32 v1, v4
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
+; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -4817,17 +4793,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_3(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v2
-; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -4835,17 +4810,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_3(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v7, v4
-; GFX942-NEXT: v_mov_b32_e32 v9, v2
-; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v1, v6
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -5401,27 +5376,28 @@ define void @v_shuffle_v4f32_v3f32__5_u_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_u_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v1
-; GFX90A-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-NEXT: global_store_dwordx4 v3, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, v1
+; GFX90A-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_u_4_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v1
-; GFX942-NEXT: v_mov_b32_e32 v5, v1
-; GFX942-NEXT: global_store_dwordx4 v3, v[2:5], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
+; GFX942-NEXT: v_mov_b32_e32 v2, v1
+; GFX942-NEXT: v_mov_b32_e32 v3, v1
+; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -5452,17 +5428,17 @@ define void @v_shuffle_v4f32_v3f32__5_0_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v3
-; GFX90A-NEXT: v_mov_b32_e32 v9, v3
-; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, v5
+; GFX90A-NEXT: v_mov_b32_e32 v3, v5
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -5470,17 +5446,17 @@ define void @v_shuffle_v4f32_v3f32__5_0_4_4(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v8, v3
-; GFX942-NEXT: v_mov_b32_e32 v9, v3
-; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v2, v5
+; GFX942-NEXT: v_mov_b32_e32 v3, v5
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -5567,35 +5543,36 @@ define void @v_shuffle_v4f32_v3f32__5_2_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_2_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, v2
-; GFX90A-NEXT: v_mov_b32_e32 v8, v5
-; GFX90A-NEXT: v_mov_b32_e32 v9, v5
-; GFX90A-NEXT: global_store_dwordx4 v3, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, v5
+; GFX90A-NEXT: v_mov_b32_e32 v3, v5
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_2_4_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v7, v2
-; GFX942-NEXT: v_mov_b32_e32 v8, v5
-; GFX942-NEXT: v_mov_b32_e32 v9, v5
-; GFX942-NEXT: global_store_dwordx4 v3, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v2, v5
+; GFX942-NEXT: v_mov_b32_e32 v3, v5
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -5622,31 +5599,28 @@ define void @v_shuffle_v4f32_v3f32__5_3_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_3_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v1
-; GFX90A-NEXT: v_mov_b32_e32 v7, v1
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_3_4_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v1
-; GFX942-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v4
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -5674,31 +5648,28 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_mov_b32_e32 v6, v1
-; GFX90A-NEXT: v_mov_b32_e32 v7, v1
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_4_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v2
-; GFX942-NEXT: v_mov_b32_e32 v6, v1
-; GFX942-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v4
+; GFX942-NEXT: v_mov_b32_e32 v1, v4
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -5726,29 +5697,27 @@ define void @v_shuffle_v4f32_v3f32__5_5_u_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_u_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v1
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_u_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v4
+; GFX942-NEXT: v_mov_b32_e32 v1, v4
+; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -5780,17 +5749,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_4(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v3
-; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_mov_b32_e32 v3, v5
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -5798,17 +5766,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_4(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v7, v4
-; GFX942-NEXT: v_mov_b32_e32 v9, v3
-; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v1, v6
+; GFX942-NEXT: v_mov_b32_e32 v3, v5
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -5840,17 +5808,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_4(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
-; GFX90A-NEXT: v_mov_b32_e32 v8, v1
-; GFX90A-NEXT: v_mov_b32_e32 v9, v3
-; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v5
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -5858,17 +5826,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_4(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, v1
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v7, v4
-; GFX942-NEXT: v_mov_b32_e32 v9, v3
-; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v1, v6
+; GFX942-NEXT: v_mov_b32_e32 v3, v5
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -5954,31 +5922,27 @@ define void @v_shuffle_v4f32_v3f32__5_5_3_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_3_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: v_mov_b32_e32 v7, v1
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_3_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v2
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v4
+; GFX942-NEXT: v_mov_b32_e32 v1, v4
+; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -6544,31 +6508,30 @@ define void @v_shuffle_v4f32_v3f32__5_3_5_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_3_5_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v2
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_3_5_5:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v2
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v4
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -6641,29 +6604,28 @@ define void @v_shuffle_v4f32_v3f32__5_5_u_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_u_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v2
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_u_5:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v2
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
+; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -6695,17 +6657,16 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_5(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v4
-; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_mov_b32_e32 v3, v6
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -6713,17 +6674,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_5(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v7, v4
-; GFX942-NEXT: v_mov_b32_e32 v9, v4
-; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v1, v6
+; GFX942-NEXT: v_mov_b32_e32 v3, v6
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -6869,31 +6830,28 @@ define void @v_shuffle_v4f32_v3f32__5_5_3_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_3_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: v_mov_b32_e32 v7, v2
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_3_5:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v2
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: v_mov_b32_e32 v7, v2
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v4
+; GFX942-NEXT: v_mov_b32_e32 v1, v4
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v4f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v4f32.ll
index d5bd41397c4f0..edc540edb3ad1 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v4f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v4f32.ll
@@ -963,29 +963,26 @@ define void @v_shuffle_v4f32_v4f32__7_7_4_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_4_u:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_4_u:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -1012,29 +1009,26 @@ define void @v_shuffle_v4f32_v4f32__7_7_5_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_5_u:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v1
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v4, v1
+; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_5_u:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_mov_b32_e32 v8, v1
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: v_mov_b32_e32 v4, v1
+; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -1446,31 +1440,31 @@ define void @v_shuffle_v4f32_v4f32__7_7_7_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_7_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v3
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_7_5:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v4, v3
+; GFX942-NEXT: v_mov_b32_e32 v5, v3
; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_mov_b32_e32 v8, v3
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v7, v1
+; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -1740,31 +1734,28 @@ define void @v_shuffle_v4f32_v4f32__2_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v4f32__2_0_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v0
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v4f32__2_0_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, v0
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -2794,15 +2785,14 @@ define void @v_shuffle_v4f32_v4f32__7_7_4_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, v5
-; GFX90A-NEXT: v_mov_b32_e32 v9, v5
-; GFX90A-NEXT: v_mov_b32_e32 v10, v2
-; GFX90A-NEXT: v_mov_b32_e32 v11, v0
-; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v5
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2812,15 +2802,14 @@ define void @v_shuffle_v4f32_v4f32__7_7_4_0(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v11, v0
-; GFX942-NEXT: v_mov_b32_e32 v8, v5
-; GFX942-NEXT: v_mov_b32_e32 v9, v5
-; GFX942-NEXT: v_mov_b32_e32 v10, v2
-; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, v5
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -4149,15 +4138,14 @@ define void @v_shuffle_v4f32_v4f32__7_7_4_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, v5
-; GFX90A-NEXT: v_mov_b32_e32 v9, v5
-; GFX90A-NEXT: v_mov_b32_e32 v10, v2
-; GFX90A-NEXT: v_mov_b32_e32 v11, v1
-; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v5
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -4167,15 +4155,14 @@ define void @v_shuffle_v4f32_v4f32__7_7_4_1(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v11, v1
-; GFX942-NEXT: v_mov_b32_e32 v8, v5
-; GFX942-NEXT: v_mov_b32_e32 v9, v5
-; GFX942-NEXT: v_mov_b32_e32 v10, v2
-; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v7, v1
+; GFX942-NEXT: v_mov_b32_e32 v4, v5
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -4209,15 +4196,14 @@ define void @v_shuffle_v4f32_v4f32__7_7_5_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, v5
-; GFX90A-NEXT: v_mov_b32_e32 v9, v5
-; GFX90A-NEXT: v_mov_b32_e32 v10, v3
-; GFX90A-NEXT: v_mov_b32_e32 v11, v1
-; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v5
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -4227,15 +4213,14 @@ define void @v_shuffle_v4f32_v4f32__7_7_5_1(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v11, v1
-; GFX942-NEXT: v_mov_b32_e32 v8, v5
-; GFX942-NEXT: v_mov_b32_e32 v9, v5
-; GFX942-NEXT: v_mov_b32_e32 v10, v3
-; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v7, v1
+; GFX942-NEXT: v_mov_b32_e32 v4, v5
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -5463,37 +5448,34 @@ define void @v_shuffle_v4f32_v4f32__7_7_4_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_4_2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:3]
+; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v10, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:7]
+; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v10, v7
-; GFX90A-NEXT: v_mov_b32_e32 v11, v7
-; GFX90A-NEXT: v_mov_b32_e32 v12, v4
-; GFX90A-NEXT: v_mov_b32_e32 v13, v2
-; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v7
+; GFX90A-NEXT: v_mov_b32_e32 v8, v4
+; GFX90A-NEXT: v_mov_b32_e32 v9, v2
+; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_4_2:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:3]
+; GFX942-NEXT: ; def v[4:7]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: v_mov_b32_e32 v10, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:7]
+; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v10, v7
-; GFX942-NEXT: v_mov_b32_e32 v11, v7
-; GFX942-NEXT: v_mov_b32_e32 v12, v4
-; GFX942-NEXT: v_mov_b32_e32 v13, v2
-; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, v7
+; GFX942-NEXT: v_mov_b32_e32 v8, v4
+; GFX942-NEXT: v_mov_b32_e32 v9, v2
+; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -7190,31 +7172,28 @@ define void @v_shuffle_v4f32_v4f32__6_4_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v4f32__6_4_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v0
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v4f32__6_4_4_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, v0
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -7292,29 +7271,28 @@ define void @v_shuffle_v4f32_v4f32__7_u_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_u_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v0
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_u_4_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, v0
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -7581,31 +7559,30 @@ define void @v_shuffle_v4f32_v4f32__7_5_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_5_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v1
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v0
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_5_4_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, v0
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: v_mov_b32_e32 v3, v1
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -7683,31 +7660,28 @@ define void @v_shuffle_v4f32_v4f32__7_7_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v0
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_4_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, v0
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -7789,15 +7763,14 @@ define void @v_shuffle_v4f32_v4f32__7_7_0_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, v5
-; GFX90A-NEXT: v_mov_b32_e32 v9, v5
-; GFX90A-NEXT: v_mov_b32_e32 v10, v0
-; GFX90A-NEXT: v_mov_b32_e32 v11, v2
-; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v5
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -7807,15 +7780,14 @@ define void @v_shuffle_v4f32_v4f32__7_7_0_4(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v10, v0
-; GFX942-NEXT: v_mov_b32_e32 v8, v5
-; GFX942-NEXT: v_mov_b32_e32 v9, v5
-; GFX942-NEXT: v_mov_b32_e32 v11, v2
-; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, v5
+; GFX942-NEXT: v_mov_b32_e32 v7, v2
+; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -8069,31 +8041,31 @@ define void @v_shuffle_v4f32_v4f32__7_7_6_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_6_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v2
-; GFX90A-NEXT: v_mov_b32_e32 v9, v0
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_6_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_mov_b32_e32 v8, v2
-; GFX942-NEXT: v_mov_b32_e32 v9, v0
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v3
+; GFX942-NEXT: v_mov_b32_e32 v5, v3
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -8974,31 +8946,28 @@ define void @v_shuffle_v4f32_v4f32__7_7_5_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_5_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v1
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v4, v1
+; GFX90A-NEXT: v_mov_b32_e32 v5, v1
+; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_5_5:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_mov_b32_e32 v8, v1
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: v_mov_b32_e32 v4, v1
+; GFX942-NEXT: v_mov_b32_e32 v5, v1
+; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -9078,15 +9047,14 @@ define void @v_shuffle_v4f32_v4f32__7_7_0_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, v5
-; GFX90A-NEXT: v_mov_b32_e32 v9, v5
-; GFX90A-NEXT: v_mov_b32_e32 v10, v0
-; GFX90A-NEXT: v_mov_b32_e32 v11, v3
-; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v5
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -9096,15 +9064,14 @@ define void @v_shuffle_v4f32_v4f32__7_7_0_5(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v10, v0
-; GFX942-NEXT: v_mov_b32_e32 v8, v5
-; GFX942-NEXT: v_mov_b32_e32 v9, v5
-; GFX942-NEXT: v_mov_b32_e32 v11, v3
-; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, v5
+; GFX942-NEXT: v_mov_b32_e32 v7, v3
+; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -9138,15 +9105,14 @@ define void @v_shuffle_v4f32_v4f32__7_7_1_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, v5
-; GFX90A-NEXT: v_mov_b32_e32 v9, v5
-; GFX90A-NEXT: v_mov_b32_e32 v10, v1
-; GFX90A-NEXT: v_mov_b32_e32 v11, v3
-; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v5
+; GFX90A-NEXT: v_mov_b32_e32 v6, v1
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -9156,15 +9122,14 @@ define void @v_shuffle_v4f32_v4f32__7_7_1_5(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v10, v1
-; GFX942-NEXT: v_mov_b32_e32 v8, v5
-; GFX942-NEXT: v_mov_b32_e32 v9, v5
-; GFX942-NEXT: v_mov_b32_e32 v11, v3
-; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, v1
+; GFX942-NEXT: v_mov_b32_e32 v4, v5
+; GFX942-NEXT: v_mov_b32_e32 v7, v3
+; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -9312,31 +9277,28 @@ define void @v_shuffle_v4f32_v4f32__7_7_4_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_4_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: v_mov_b32_e32 v5, v1
+; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_4_5:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: v_mov_b32_e32 v5, v1
+; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -9365,31 +9327,31 @@ define void @v_shuffle_v4f32_v4f32__7_7_6_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_6_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v2
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_6_5:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_mov_b32_e32 v8, v2
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v3
+; GFX942-NEXT: v_mov_b32_e32 v5, v3
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v1
+; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -10357,15 +10319,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_0_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v10, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, v5
-; GFX90A-NEXT: v_mov_b32_e32 v9, v5
-; GFX90A-NEXT: v_mov_b32_e32 v10, v0
-; GFX90A-NEXT: v_mov_b32_e32 v11, v4
-; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v5
+; GFX90A-NEXT: v_mov_b32_e32 v7, v5
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v4
+; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -10375,15 +10337,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_0_6(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v10, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v10, v0
-; GFX942-NEXT: v_mov_b32_e32 v8, v5
-; GFX942-NEXT: v_mov_b32_e32 v9, v5
-; GFX942-NEXT: v_mov_b32_e32 v11, v4
-; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v5
+; GFX942-NEXT: v_mov_b32_e32 v7, v5
+; GFX942-NEXT: v_mov_b32_e32 v9, v4
+; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -10591,31 +10553,31 @@ define void @v_shuffle_v4f32_v4f32__7_7_4_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_4_6:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v2
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_4_6:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, v2
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v3
+; GFX942-NEXT: v_mov_b32_e32 v5, v3
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v7, v2
+; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -11815,31 +11777,31 @@ define void @v_shuffle_v4f32_v4f32__7_7_4_7(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_4_7:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v3
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_4_7:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v4, v3
+; GFX942-NEXT: v_mov_b32_e32 v5, v3
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, v3
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -11868,31 +11830,31 @@ define void @v_shuffle_v4f32_v4f32__7_7_5_7(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_5_7:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v6, v1
; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v1
-; GFX90A-NEXT: v_mov_b32_e32 v9, v3
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_5_7:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v4, v3
+; GFX942-NEXT: v_mov_b32_e32 v5, v3
+; GFX942-NEXT: v_mov_b32_e32 v6, v1
; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_mov_b32_e32 v8, v1
-; GFX942-NEXT: v_mov_b32_e32 v9, v3
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v2i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v2i32.ll
index 03503c9dac197..9d3affa6da266 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v2i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v2i32.ll
@@ -272,27 +272,27 @@ define void @v_shuffle_v4i32_v2i32__3_2_u_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v2i32__3_2_u_u:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v1
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: global_store_dwordx4 v2, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v2i32__3_2_u_u:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:1]
+; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v1
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: global_store_dwordx4 v2, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i32> asm "; def $0", "=v"()
@@ -2386,29 +2386,28 @@ define void @v_shuffle_v4i32_v2i32__3_3_u_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v2i32__3_3_u_2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v1
-; GFX90A-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: global_store_dwordx4 v2, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v2i32__3_3_u_2:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:1]
+; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v1
-; GFX942-NEXT: v_mov_b32_e32 v5, v1
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: global_store_dwordx4 v2, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v1, v3
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
+; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i32> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v3i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v3i32.ll
index 0222f73fbd193..1a669adf2b635 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v3i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v3i32.ll
@@ -255,15 +255,15 @@ define void @v_shuffle_v4i32_v3i32__5_0_u_u(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -271,16 +271,15 @@ define void @v_shuffle_v4i32_v3i32__5_0_u_u(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -414,27 +413,27 @@ define void @v_shuffle_v4i32_v3i32__5_3_u_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_3_u_u:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_3_u_u:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v4
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -554,16 +553,15 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_u(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -571,17 +569,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_u(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v1, v6
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -612,16 +609,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_u(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: v_mov_b32_e32 v4, v1
-; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -629,17 +626,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_u(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
-; GFX942-NEXT: v_mov_b32_e32 v4, v1
-; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -722,29 +719,27 @@ define void @v_shuffle_v4i32_v3i32__5_5_3_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_3_u:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_3_u:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v2
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v4
+; GFX942-NEXT: v_mov_b32_e32 v1, v4
+; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -771,29 +766,28 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_4_u:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_mov_b32_e32 v6, v1
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_4_u:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v2
-; GFX942-NEXT: v_mov_b32_e32 v6, v1
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v4
+; GFX942-NEXT: v_mov_b32_e32 v1, v4
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -1037,31 +1031,31 @@ define void @v_shuffle_v4i32_v3i32__5_5_5_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_5_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_mov_b32_e32 v2, v6
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_5_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v2
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v1, v6
+; GFX942-NEXT: v_mov_b32_e32 v2, v6
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -1089,31 +1083,28 @@ define void @v_shuffle_v4i32_v3i32__5_5_5_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_5_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v1
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_5_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v2
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v4
+; GFX942-NEXT: v_mov_b32_e32 v1, v4
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -1189,29 +1180,28 @@ define void @v_shuffle_v4i32_v3i32__u_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v3i32__u_0_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v0
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v3i32__u_0_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v1, v0
+; GFX942-NEXT: v_mov_b32_e32 v2, v0
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -1335,31 +1325,31 @@ define void @v_shuffle_v4i32_v3i32__2_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v3i32__2_0_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v3i32__2_0_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v1, v4
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -1385,29 +1375,28 @@ define void @v_shuffle_v4i32_v3i32__3_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v3i32__3_0_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v0
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v3i32__3_0_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v1, v0
+; GFX942-NEXT: v_mov_b32_e32 v2, v0
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -1499,15 +1488,15 @@ define void @v_shuffle_v4i32_v3i32__5_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v0
-; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1517,15 +1506,15 @@ define void @v_shuffle_v4i32_v3i32__5_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, v0
-; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
+; GFX942-NEXT: v_mov_b32_e32 v1, v4
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -1556,33 +1545,34 @@ define void @v_shuffle_v4i32_v3i32__5_u_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_u_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: global_store_dwordx4 v5, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_u_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: global_store_dwordx4 v5, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -1616,15 +1606,15 @@ define void @v_shuffle_v4i32_v3i32__5_1_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v1
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v0
-; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
+; GFX90A-NEXT: v_mov_b32_e32 v1, v5
+; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1634,15 +1624,15 @@ define void @v_shuffle_v4i32_v3i32__5_1_0_0(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, v0
-; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
+; GFX942-NEXT: v_mov_b32_e32 v1, v5
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -1674,17 +1664,17 @@ define void @v_shuffle_v4i32_v3i32__5_2_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v6
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
+; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1692,17 +1682,17 @@ define void @v_shuffle_v4i32_v3i32__5_2_0_0(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v4, v6
-; GFX942-NEXT: v_mov_b32_e32 v5, v2
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
+; GFX942-NEXT: v_mov_b32_e32 v1, v6
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -1734,17 +1724,17 @@ define void @v_shuffle_v4i32_v3i32__5_3_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v9, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[6:8]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v2
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v0
-; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v8
+; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v9, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1752,17 +1742,17 @@ define void @v_shuffle_v4i32_v3i32__5_3_0_0(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v9, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[6:8]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v7, v2
-; GFX942-NEXT: v_mov_b32_e32 v9, v0
-; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: v_mov_b32_e32 v0, v8
+; GFX942-NEXT: v_mov_b32_e32 v1, v6
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -1853,15 +1843,15 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v0
-; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1871,15 +1861,15 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_0(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v7, v4
-; GFX942-NEXT: v_mov_b32_e32 v9, v0
-; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -1911,16 +1901,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_u_0(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
-; GFX90A-NEXT: v_mov_b32_e32 v9, v0
-; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1928,16 +1918,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_u_0(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v9, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v7, v4
-; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v1, v6
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -2028,17 +2018,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_2_0(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v6
-; GFX90A-NEXT: v_mov_b32_e32 v5, v6
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, v6
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2046,17 +2036,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_2_0(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v4, v6
-; GFX942-NEXT: v_mov_b32_e32 v5, v6
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: v_mov_b32_e32 v2, v6
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -2088,17 +2078,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_3_0(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v9, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[6:8]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
-; GFX90A-NEXT: v_mov_b32_e32 v8, v2
-; GFX90A-NEXT: v_mov_b32_e32 v9, v0
-; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v8
+; GFX90A-NEXT: v_mov_b32_e32 v1, v8
+; GFX90A-NEXT: v_mov_b32_e32 v2, v6
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v9, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2106,17 +2096,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_3_0(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v9, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[6:8]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v9, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v7, v4
-; GFX942-NEXT: v_mov_b32_e32 v8, v2
-; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: v_mov_b32_e32 v0, v8
+; GFX942-NEXT: v_mov_b32_e32 v1, v8
+; GFX942-NEXT: v_mov_b32_e32 v2, v6
+; GFX942-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -2613,17 +2603,16 @@ define void @v_shuffle_v4i32_v3i32__5_0_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v1
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
-; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2631,17 +2620,16 @@ define void @v_shuffle_v4i32_v3i32__5_0_1_1(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v8, v1
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
-; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -2673,17 +2661,16 @@ define void @v_shuffle_v4i32_v3i32__5_2_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v6
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_mov_b32_e32 v6, v1
-; GFX90A-NEXT: v_mov_b32_e32 v7, v1
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2691,17 +2678,17 @@ define void @v_shuffle_v4i32_v3i32__5_2_1_1(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v4, v6
-; GFX942-NEXT: v_mov_b32_e32 v5, v2
-; GFX942-NEXT: v_mov_b32_e32 v6, v1
-; GFX942-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
+; GFX942-NEXT: v_mov_b32_e32 v1, v4
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -2733,17 +2720,16 @@ define void @v_shuffle_v4i32_v3i32__5_3_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v2
-; GFX90A-NEXT: v_mov_b32_e32 v8, v1
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
-; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2751,17 +2737,16 @@ define void @v_shuffle_v4i32_v3i32__5_3_1_1(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, v1
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v7, v2
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
-; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v1, v4
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -2849,17 +2834,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
-; GFX90A-NEXT: v_mov_b32_e32 v8, v1
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
-; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2867,17 +2851,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_1(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, v1
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v7, v4
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
-; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -2909,16 +2893,15 @@ define void @v_shuffle_v4i32_v3i32__5_5_u_1(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
-; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2926,16 +2909,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_u_1(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v7, v4
-; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -2966,17 +2949,15 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_1(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
-; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2984,17 +2965,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_1(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v7, v4
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
-; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v1, v6
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -3026,17 +3006,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_2_1(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v6
-; GFX90A-NEXT: v_mov_b32_e32 v5, v6
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v1
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -3044,17 +3023,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_2_1(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v4, v6
-; GFX942-NEXT: v_mov_b32_e32 v5, v6
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -3086,17 +3065,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_3_1(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
-; GFX90A-NEXT: v_mov_b32_e32 v8, v2
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
-; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -3104,17 +3082,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_3_1(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v7, v4
-; GFX942-NEXT: v_mov_b32_e32 v8, v2
-; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v1, v6
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -3146,17 +3124,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_1(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
-; GFX90A-NEXT: v_mov_b32_e32 v8, v3
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
-; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_mov_b32_e32 v2, v5
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -3164,17 +3141,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_1(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v7, v4
-; GFX942-NEXT: v_mov_b32_e32 v8, v3
-; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v1, v6
+; GFX942-NEXT: v_mov_b32_e32 v2, v5
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -3607,17 +3584,17 @@ define void @v_shuffle_v4i32_v3i32__5_0_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[6:8]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v6
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v2
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v8
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -3625,17 +3602,17 @@ define void @v_shuffle_v4i32_v3i32__5_0_2_2(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[6:8]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v4, v6
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v2
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: v_mov_b32_e32 v0, v8
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -3894,16 +3871,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_u_2(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v6
-; GFX90A-NEXT: v_mov_b32_e32 v5, v6
-; GFX90A-NEXT: v_mov_b32_e32 v7, v2
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -3911,16 +3888,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_u_2(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v4, v6
-; GFX942-NEXT: v_mov_b32_e32 v5, v6
-; GFX942-NEXT: v_mov_b32_e32 v7, v2
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v1, v6
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -3952,17 +3929,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_2(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[6:8]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v6
-; GFX90A-NEXT: v_mov_b32_e32 v5, v6
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: v_mov_b32_e32 v7, v2
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v8
+; GFX90A-NEXT: v_mov_b32_e32 v1, v8
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -3970,17 +3946,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_2(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[6:8]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v4, v6
-; GFX942-NEXT: v_mov_b32_e32 v5, v6
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: v_mov_b32_e32 v7, v2
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: v_mov_b32_e32 v0, v8
+; GFX942-NEXT: v_mov_b32_e32 v1, v8
+; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -4070,37 +4045,36 @@ define void @v_shuffle_v4i32_v3i32__5_5_3_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_3_2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[6:8]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, v6
-; GFX90A-NEXT: v_mov_b32_e32 v9, v6
-; GFX90A-NEXT: v_mov_b32_e32 v10, v4
-; GFX90A-NEXT: v_mov_b32_e32 v11, v2
-; GFX90A-NEXT: global_store_dwordx4 v3, v[8:11], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v8
+; GFX90A-NEXT: v_mov_b32_e32 v1, v8
+; GFX90A-NEXT: v_mov_b32_e32 v2, v6
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_3_2:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[6:8]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v8, v6
-; GFX942-NEXT: v_mov_b32_e32 v9, v6
-; GFX942-NEXT: v_mov_b32_e32 v10, v4
-; GFX942-NEXT: v_mov_b32_e32 v11, v2
-; GFX942-NEXT: global_store_dwordx4 v3, v[8:11], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: v_mov_b32_e32 v0, v8
+; GFX942-NEXT: v_mov_b32_e32 v1, v8
+; GFX942-NEXT: v_mov_b32_e32 v2, v6
+; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -4384,31 +4358,31 @@ define void @v_shuffle_v4i32_v3i32__5_3_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_3_3_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_3_3_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v1, v4
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -4436,27 +4410,29 @@ define void @v_shuffle_v4i32_v3i32__5_u_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_u_3_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v1, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: global_store_dwordx4 v3, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v1, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_u_3_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v1, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: global_store_dwordx4 v3, v[2:5], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -4487,17 +4463,17 @@ define void @v_shuffle_v4i32_v3i32__5_0_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v2
-; GFX90A-NEXT: v_mov_b32_e32 v9, v2
-; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -4505,17 +4481,17 @@ define void @v_shuffle_v4i32_v3i32__5_0_3_3(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v8, v2
-; GFX942-NEXT: v_mov_b32_e32 v9, v2
-; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -4604,35 +4580,36 @@ define void @v_shuffle_v4i32_v3i32__5_2_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_2_3_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, v2
-; GFX90A-NEXT: v_mov_b32_e32 v8, v4
-; GFX90A-NEXT: v_mov_b32_e32 v9, v4
-; GFX90A-NEXT: global_store_dwordx4 v3, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_2_3_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v7, v2
-; GFX942-NEXT: v_mov_b32_e32 v8, v4
-; GFX942-NEXT: v_mov_b32_e32 v9, v4
-; GFX942-NEXT: global_store_dwordx4 v3, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -4660,31 +4637,31 @@ define void @v_shuffle_v4i32_v3i32__5_4_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_4_3_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v5
+; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_4_3_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v1
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v1, v5
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -4712,31 +4689,31 @@ define void @v_shuffle_v4i32_v3i32__5_5_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_3_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_3_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v2
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v1, v6
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -4764,29 +4741,28 @@ define void @v_shuffle_v4i32_v3i32__5_5_u_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_u_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_u_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v4
+; GFX942-NEXT: v_mov_b32_e32 v1, v4
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
+; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -4817,17 +4793,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_3(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v2
-; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -4835,17 +4810,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_3(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v7, v4
-; GFX942-NEXT: v_mov_b32_e32 v9, v2
-; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v1, v6
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -5401,27 +5376,28 @@ define void @v_shuffle_v4i32_v3i32__5_u_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_u_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v1
-; GFX90A-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-NEXT: global_store_dwordx4 v3, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, v1
+; GFX90A-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_u_4_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v1
-; GFX942-NEXT: v_mov_b32_e32 v5, v1
-; GFX942-NEXT: global_store_dwordx4 v3, v[2:5], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
+; GFX942-NEXT: v_mov_b32_e32 v2, v1
+; GFX942-NEXT: v_mov_b32_e32 v3, v1
+; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -5452,17 +5428,17 @@ define void @v_shuffle_v4i32_v3i32__5_0_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v3
-; GFX90A-NEXT: v_mov_b32_e32 v9, v3
-; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, v5
+; GFX90A-NEXT: v_mov_b32_e32 v3, v5
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -5470,17 +5446,17 @@ define void @v_shuffle_v4i32_v3i32__5_0_4_4(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v8, v3
-; GFX942-NEXT: v_mov_b32_e32 v9, v3
-; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v2, v5
+; GFX942-NEXT: v_mov_b32_e32 v3, v5
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -5567,35 +5543,36 @@ define void @v_shuffle_v4i32_v3i32__5_2_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_2_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, v2
-; GFX90A-NEXT: v_mov_b32_e32 v8, v5
-; GFX90A-NEXT: v_mov_b32_e32 v9, v5
-; GFX90A-NEXT: global_store_dwordx4 v3, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, v5
+; GFX90A-NEXT: v_mov_b32_e32 v3, v5
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_2_4_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v7, v2
-; GFX942-NEXT: v_mov_b32_e32 v8, v5
-; GFX942-NEXT: v_mov_b32_e32 v9, v5
-; GFX942-NEXT: global_store_dwordx4 v3, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v2, v5
+; GFX942-NEXT: v_mov_b32_e32 v3, v5
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -5622,31 +5599,28 @@ define void @v_shuffle_v4i32_v3i32__5_3_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_3_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v1
-; GFX90A-NEXT: v_mov_b32_e32 v7, v1
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_3_4_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v1
-; GFX942-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v4
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -5674,31 +5648,28 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_mov_b32_e32 v6, v1
-; GFX90A-NEXT: v_mov_b32_e32 v7, v1
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_4_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v2
-; GFX942-NEXT: v_mov_b32_e32 v6, v1
-; GFX942-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v4
+; GFX942-NEXT: v_mov_b32_e32 v1, v4
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -5726,29 +5697,27 @@ define void @v_shuffle_v4i32_v3i32__5_5_u_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_u_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v1
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_u_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v4
+; GFX942-NEXT: v_mov_b32_e32 v1, v4
+; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -5780,17 +5749,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_4(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v3
-; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_mov_b32_e32 v3, v5
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -5798,17 +5766,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_4(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v7, v4
-; GFX942-NEXT: v_mov_b32_e32 v9, v3
-; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v1, v6
+; GFX942-NEXT: v_mov_b32_e32 v3, v5
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -5840,17 +5808,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_4(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
-; GFX90A-NEXT: v_mov_b32_e32 v8, v1
-; GFX90A-NEXT: v_mov_b32_e32 v9, v3
-; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v5
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -5858,17 +5826,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_4(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, v1
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v7, v4
-; GFX942-NEXT: v_mov_b32_e32 v9, v3
-; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v1, v6
+; GFX942-NEXT: v_mov_b32_e32 v3, v5
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -5954,31 +5922,27 @@ define void @v_shuffle_v4i32_v3i32__5_5_3_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_3_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: v_mov_b32_e32 v7, v1
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_3_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v2
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v4
+; GFX942-NEXT: v_mov_b32_e32 v1, v4
+; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -6544,31 +6508,30 @@ define void @v_shuffle_v4i32_v3i32__5_3_5_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_3_5_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v2
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_3_5_5:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v2
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v4
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -6641,29 +6604,28 @@ define void @v_shuffle_v4i32_v3i32__5_5_u_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_u_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v2
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_u_5:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v2
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
+; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -6695,17 +6657,16 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_5(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v4
-; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_mov_b32_e32 v3, v6
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -6713,17 +6674,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_5(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v7, v4
-; GFX942-NEXT: v_mov_b32_e32 v9, v4
-; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v1, v6
+; GFX942-NEXT: v_mov_b32_e32 v3, v6
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
@@ -6869,31 +6830,28 @@ define void @v_shuffle_v4i32_v3i32__5_5_3_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_3_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: v_mov_b32_e32 v7, v2
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_3_5:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v2
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: v_mov_b32_e32 v7, v2
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v4
+; GFX942-NEXT: v_mov_b32_e32 v1, v4
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i32> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v4i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v4i32.ll
index ee2f94b90ffa9..983afa566e2c1 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v4i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v4i32.ll
@@ -963,29 +963,26 @@ define void @v_shuffle_v4i32_v4i32__7_7_4_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_4_u:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_4_u:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -1012,29 +1009,26 @@ define void @v_shuffle_v4i32_v4i32__7_7_5_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_5_u:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v1
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v4, v1
+; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_5_u:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_mov_b32_e32 v8, v1
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: v_mov_b32_e32 v4, v1
+; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -1446,31 +1440,31 @@ define void @v_shuffle_v4i32_v4i32__7_7_7_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_7_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v3
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_7_5:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v4, v3
+; GFX942-NEXT: v_mov_b32_e32 v5, v3
; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_mov_b32_e32 v8, v3
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v7, v1
+; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -1740,31 +1734,28 @@ define void @v_shuffle_v4i32_v4i32__2_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v4i32__2_0_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v0
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v4i32__2_0_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, v0
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -2794,15 +2785,14 @@ define void @v_shuffle_v4i32_v4i32__7_7_4_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, v5
-; GFX90A-NEXT: v_mov_b32_e32 v9, v5
-; GFX90A-NEXT: v_mov_b32_e32 v10, v2
-; GFX90A-NEXT: v_mov_b32_e32 v11, v0
-; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v5
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2812,15 +2802,14 @@ define void @v_shuffle_v4i32_v4i32__7_7_4_0(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v11, v0
-; GFX942-NEXT: v_mov_b32_e32 v8, v5
-; GFX942-NEXT: v_mov_b32_e32 v9, v5
-; GFX942-NEXT: v_mov_b32_e32 v10, v2
-; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, v5
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -4149,15 +4138,14 @@ define void @v_shuffle_v4i32_v4i32__7_7_4_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, v5
-; GFX90A-NEXT: v_mov_b32_e32 v9, v5
-; GFX90A-NEXT: v_mov_b32_e32 v10, v2
-; GFX90A-NEXT: v_mov_b32_e32 v11, v1
-; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v5
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -4167,15 +4155,14 @@ define void @v_shuffle_v4i32_v4i32__7_7_4_1(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v11, v1
-; GFX942-NEXT: v_mov_b32_e32 v8, v5
-; GFX942-NEXT: v_mov_b32_e32 v9, v5
-; GFX942-NEXT: v_mov_b32_e32 v10, v2
-; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v7, v1
+; GFX942-NEXT: v_mov_b32_e32 v4, v5
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -4209,15 +4196,14 @@ define void @v_shuffle_v4i32_v4i32__7_7_5_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, v5
-; GFX90A-NEXT: v_mov_b32_e32 v9, v5
-; GFX90A-NEXT: v_mov_b32_e32 v10, v3
-; GFX90A-NEXT: v_mov_b32_e32 v11, v1
-; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v5
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -4227,15 +4213,14 @@ define void @v_shuffle_v4i32_v4i32__7_7_5_1(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v11, v1
-; GFX942-NEXT: v_mov_b32_e32 v8, v5
-; GFX942-NEXT: v_mov_b32_e32 v9, v5
-; GFX942-NEXT: v_mov_b32_e32 v10, v3
-; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v7, v1
+; GFX942-NEXT: v_mov_b32_e32 v4, v5
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -5463,37 +5448,34 @@ define void @v_shuffle_v4i32_v4i32__7_7_4_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_4_2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:3]
+; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v10, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:7]
+; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v10, v7
-; GFX90A-NEXT: v_mov_b32_e32 v11, v7
-; GFX90A-NEXT: v_mov_b32_e32 v12, v4
-; GFX90A-NEXT: v_mov_b32_e32 v13, v2
-; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v7
+; GFX90A-NEXT: v_mov_b32_e32 v8, v4
+; GFX90A-NEXT: v_mov_b32_e32 v9, v2
+; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_4_2:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:3]
+; GFX942-NEXT: ; def v[4:7]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: v_mov_b32_e32 v10, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:7]
+; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v10, v7
-; GFX942-NEXT: v_mov_b32_e32 v11, v7
-; GFX942-NEXT: v_mov_b32_e32 v12, v4
-; GFX942-NEXT: v_mov_b32_e32 v13, v2
-; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, v7
+; GFX942-NEXT: v_mov_b32_e32 v8, v4
+; GFX942-NEXT: v_mov_b32_e32 v9, v2
+; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -7190,31 +7172,28 @@ define void @v_shuffle_v4i32_v4i32__6_4_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v4i32__6_4_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v0
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v4i32__6_4_4_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, v0
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -7292,29 +7271,28 @@ define void @v_shuffle_v4i32_v4i32__7_u_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_u_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v0
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_u_4_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, v0
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -7581,31 +7559,30 @@ define void @v_shuffle_v4i32_v4i32__7_5_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_5_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v1
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v0
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_5_4_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, v0
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: v_mov_b32_e32 v3, v1
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -7683,31 +7660,28 @@ define void @v_shuffle_v4i32_v4i32__7_7_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v0
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_4_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, v0
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -7789,15 +7763,14 @@ define void @v_shuffle_v4i32_v4i32__7_7_0_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, v5
-; GFX90A-NEXT: v_mov_b32_e32 v9, v5
-; GFX90A-NEXT: v_mov_b32_e32 v10, v0
-; GFX90A-NEXT: v_mov_b32_e32 v11, v2
-; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v5
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -7807,15 +7780,14 @@ define void @v_shuffle_v4i32_v4i32__7_7_0_4(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v10, v0
-; GFX942-NEXT: v_mov_b32_e32 v8, v5
-; GFX942-NEXT: v_mov_b32_e32 v9, v5
-; GFX942-NEXT: v_mov_b32_e32 v11, v2
-; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, v5
+; GFX942-NEXT: v_mov_b32_e32 v7, v2
+; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -8069,31 +8041,31 @@ define void @v_shuffle_v4i32_v4i32__7_7_6_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_6_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v2
-; GFX90A-NEXT: v_mov_b32_e32 v9, v0
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_6_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_mov_b32_e32 v8, v2
-; GFX942-NEXT: v_mov_b32_e32 v9, v0
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v3
+; GFX942-NEXT: v_mov_b32_e32 v5, v3
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -8974,31 +8946,28 @@ define void @v_shuffle_v4i32_v4i32__7_7_5_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_5_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v1
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v4, v1
+; GFX90A-NEXT: v_mov_b32_e32 v5, v1
+; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_5_5:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_mov_b32_e32 v8, v1
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: v_mov_b32_e32 v4, v1
+; GFX942-NEXT: v_mov_b32_e32 v5, v1
+; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -9078,15 +9047,14 @@ define void @v_shuffle_v4i32_v4i32__7_7_0_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, v5
-; GFX90A-NEXT: v_mov_b32_e32 v9, v5
-; GFX90A-NEXT: v_mov_b32_e32 v10, v0
-; GFX90A-NEXT: v_mov_b32_e32 v11, v3
-; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v5
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -9096,15 +9064,14 @@ define void @v_shuffle_v4i32_v4i32__7_7_0_5(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v10, v0
-; GFX942-NEXT: v_mov_b32_e32 v8, v5
-; GFX942-NEXT: v_mov_b32_e32 v9, v5
-; GFX942-NEXT: v_mov_b32_e32 v11, v3
-; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, v5
+; GFX942-NEXT: v_mov_b32_e32 v7, v3
+; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -9138,15 +9105,14 @@ define void @v_shuffle_v4i32_v4i32__7_7_1_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, v5
-; GFX90A-NEXT: v_mov_b32_e32 v9, v5
-; GFX90A-NEXT: v_mov_b32_e32 v10, v1
-; GFX90A-NEXT: v_mov_b32_e32 v11, v3
-; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v5
+; GFX90A-NEXT: v_mov_b32_e32 v6, v1
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -9156,15 +9122,14 @@ define void @v_shuffle_v4i32_v4i32__7_7_1_5(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v10, v1
-; GFX942-NEXT: v_mov_b32_e32 v8, v5
-; GFX942-NEXT: v_mov_b32_e32 v9, v5
-; GFX942-NEXT: v_mov_b32_e32 v11, v3
-; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, v1
+; GFX942-NEXT: v_mov_b32_e32 v4, v5
+; GFX942-NEXT: v_mov_b32_e32 v7, v3
+; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -9312,31 +9277,28 @@ define void @v_shuffle_v4i32_v4i32__7_7_4_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_4_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: v_mov_b32_e32 v5, v1
+; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_4_5:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: v_mov_b32_e32 v5, v1
+; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -9365,31 +9327,31 @@ define void @v_shuffle_v4i32_v4i32__7_7_6_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_6_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v2
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_6_5:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_mov_b32_e32 v8, v2
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v3
+; GFX942-NEXT: v_mov_b32_e32 v5, v3
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v1
+; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -10357,15 +10319,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_0_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v10, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, v5
-; GFX90A-NEXT: v_mov_b32_e32 v9, v5
-; GFX90A-NEXT: v_mov_b32_e32 v10, v0
-; GFX90A-NEXT: v_mov_b32_e32 v11, v4
-; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v5
+; GFX90A-NEXT: v_mov_b32_e32 v7, v5
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v4
+; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -10375,15 +10337,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_0_6(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v10, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v10, v0
-; GFX942-NEXT: v_mov_b32_e32 v8, v5
-; GFX942-NEXT: v_mov_b32_e32 v9, v5
-; GFX942-NEXT: v_mov_b32_e32 v11, v4
-; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v5
+; GFX942-NEXT: v_mov_b32_e32 v7, v5
+; GFX942-NEXT: v_mov_b32_e32 v9, v4
+; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -10591,31 +10553,31 @@ define void @v_shuffle_v4i32_v4i32__7_7_4_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_4_6:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v2
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_4_6:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, v2
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v3
+; GFX942-NEXT: v_mov_b32_e32 v5, v3
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v7, v2
+; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -11815,31 +11777,31 @@ define void @v_shuffle_v4i32_v4i32__7_7_4_7(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_4_7:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v3
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_4_7:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v4, v3
+; GFX942-NEXT: v_mov_b32_e32 v5, v3
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, v3
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
@@ -11868,31 +11830,31 @@ define void @v_shuffle_v4i32_v4i32__7_7_5_7(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_5_7:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v6, v1
; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v1
-; GFX90A-NEXT: v_mov_b32_e32 v9, v3
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_5_7:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v4, v3
+; GFX942-NEXT: v_mov_b32_e32 v5, v3
+; GFX942-NEXT: v_mov_b32_e32 v6, v1
; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_mov_b32_e32 v8, v1
-; GFX942-NEXT: v_mov_b32_e32 v9, v3
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i32> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v2i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v2i64.ll
index 21ec9acf6317d..ac7d9557ce765 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v2i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v2i64.ll
@@ -291,31 +291,27 @@ define void @v_shuffle_v4i64_v2i64__3_2_u_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_2_u_u:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: v_mov_b32_e32 v5, v1
+; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i64_v2i64__3_2_u_u:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: v_mov_b32_e32 v5, v1
+; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i64> asm "; def $0", "=v"()
@@ -628,15 +624,15 @@ define void @v_shuffle_v4i64_v2i64__3_3_3_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v4
-; GFX90A-NEXT: v_mov_b32_e32 v9, v5
-; GFX90A-NEXT: v_mov_b32_e32 v10, v0
-; GFX90A-NEXT: v_mov_b32_e32 v11, v1
+; GFX90A-NEXT: v_mov_b32_e32 v10, 0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v5
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
; GFX90A-NEXT: v_mov_b32_e32 v2, v4
; GFX90A-NEXT: v_mov_b32_e32 v3, v5
-; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -646,18 +642,18 @@ define void @v_shuffle_v4i64_v2i64__3_3_3_0(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v10, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v10, v0
-; GFX942-NEXT: v_mov_b32_e32 v8, v4
-; GFX942-NEXT: v_mov_b32_e32 v9, v5
-; GFX942-NEXT: v_mov_b32_e32 v11, v1
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v7, v5
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
; GFX942-NEXT: v_mov_b32_e32 v2, v4
; GFX942-NEXT: v_mov_b32_e32 v3, v5
-; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i64> asm "; def $0", "=v"()
@@ -756,15 +752,15 @@ define void @v_shuffle_v4i64_v2i64__3_3_3_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v1
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: v_mov_b32_e32 v1, v3
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -774,15 +770,15 @@ define void @v_shuffle_v4i64_v2i64__3_3_3_2(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v3
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v7, v1
; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: v_mov_b32_e32 v1, v3
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i64> asm "; def $0", "=v"()
@@ -957,39 +953,33 @@ define void @v_shuffle_v4i64_v2i64__1_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i64_v2i64__1_0_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: v_mov_b32_e32 v5, v1
; GFX90A-NEXT: v_mov_b32_e32 v6, v0
; GFX90A-NEXT: v_mov_b32_e32 v7, v1
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16
-; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i64_v2i64__1_0_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: v_mov_b32_e32 v5, v1
; GFX942-NEXT: v_mov_b32_e32 v6, v0
; GFX942-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16
-; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i64> asm "; def $0", "=v"()
@@ -1216,18 +1206,18 @@ define void @v_shuffle_v4i64_v2i64__3_1_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v12, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
; GFX90A-NEXT: v_mov_b32_e32 v10, v0
; GFX90A-NEXT: v_mov_b32_e32 v11, v1
-; GFX90A-NEXT: v_mov_b32_e32 v12, v0
-; GFX90A-NEXT: v_mov_b32_e32 v13, v1
; GFX90A-NEXT: v_mov_b32_e32 v0, v6
; GFX90A-NEXT: v_mov_b32_e32 v1, v7
-; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1237,18 +1227,18 @@ define void @v_shuffle_v4i64_v2i64__3_1_0_0(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_mov_b32_e32 v12, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[4:7]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
; GFX942-NEXT: v_mov_b32_e32 v10, v0
; GFX942-NEXT: v_mov_b32_e32 v11, v1
-; GFX942-NEXT: v_mov_b32_e32 v12, v0
-; GFX942-NEXT: v_mov_b32_e32 v13, v1
; GFX942-NEXT: v_mov_b32_e32 v0, v6
; GFX942-NEXT: v_mov_b32_e32 v1, v7
-; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i64> asm "; def $0", "=v"()
@@ -1566,15 +1556,15 @@ define void @v_shuffle_v4i64_v2i64__3_3_2_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v2
-; GFX90A-NEXT: v_mov_b32_e32 v9, v3
-; GFX90A-NEXT: v_mov_b32_e32 v10, v0
-; GFX90A-NEXT: v_mov_b32_e32 v11, v1
+; GFX90A-NEXT: v_mov_b32_e32 v10, 0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
; GFX90A-NEXT: v_mov_b32_e32 v2, v4
; GFX90A-NEXT: v_mov_b32_e32 v3, v5
-; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1584,18 +1574,18 @@ define void @v_shuffle_v4i64_v2i64__3_3_2_0(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v10, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v10, v0
-; GFX942-NEXT: v_mov_b32_e32 v8, v2
-; GFX942-NEXT: v_mov_b32_e32 v9, v3
-; GFX942-NEXT: v_mov_b32_e32 v11, v1
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v3
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
; GFX942-NEXT: v_mov_b32_e32 v2, v4
; GFX942-NEXT: v_mov_b32_e32 v3, v5
-; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i64> asm "; def $0", "=v"()
@@ -1673,33 +1663,33 @@ define void @v_shuffle_v4i64_v2i64__0_1_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i64_v2i64__0_1_1_1:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: v_mov_b32_e32 v6, v2
; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v2
-; GFX90A-NEXT: v_mov_b32_e32 v9, v3
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i64_v2i64__0_1_1_1:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v3
; GFX942-NEXT: v_mov_b32_e32 v6, v2
; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_mov_b32_e32 v8, v2
-; GFX942-NEXT: v_mov_b32_e32 v9, v3
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i64> asm "; def $0", "=v"()
@@ -1971,17 +1961,17 @@ define void @v_shuffle_v4i64_v2i64__3_0_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v12, 0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v2
+; GFX90A-NEXT: v_mov_b32_e32 v9, v3
; GFX90A-NEXT: v_mov_b32_e32 v10, v2
; GFX90A-NEXT: v_mov_b32_e32 v11, v3
-; GFX90A-NEXT: v_mov_b32_e32 v12, v2
-; GFX90A-NEXT: v_mov_b32_e32 v13, v3
; GFX90A-NEXT: v_mov_b32_e32 v2, v6
; GFX90A-NEXT: v_mov_b32_e32 v3, v7
; GFX90A-NEXT: v_mov_b32_e32 v4, v0
; GFX90A-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1994,17 +1984,17 @@ define void @v_shuffle_v4i64_v2i64__3_0_1_1(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[4:7]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_mov_b32_e32 v12, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, v2
+; GFX942-NEXT: v_mov_b32_e32 v9, v3
; GFX942-NEXT: v_mov_b32_e32 v10, v2
; GFX942-NEXT: v_mov_b32_e32 v11, v3
-; GFX942-NEXT: v_mov_b32_e32 v12, v2
-; GFX942-NEXT: v_mov_b32_e32 v13, v3
; GFX942-NEXT: v_mov_b32_e32 v2, v6
; GFX942-NEXT: v_mov_b32_e32 v3, v7
; GFX942-NEXT: v_mov_b32_e32 v4, v0
; GFX942-NEXT: v_mov_b32_e32 v5, v1
-; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i64> asm "; def $0", "=v"()
@@ -2457,39 +2447,33 @@ define void @v_shuffle_v4i64_v2i64__3_2_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_2_2_2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: v_mov_b32_e32 v5, v1
; GFX90A-NEXT: v_mov_b32_e32 v6, v0
; GFX90A-NEXT: v_mov_b32_e32 v7, v1
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16
-; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i64_v2i64__3_2_2_2:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: v_mov_b32_e32 v5, v1
; GFX942-NEXT: v_mov_b32_e32 v6, v0
; GFX942-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16
-; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i64> asm "; def $0", "=v"()
@@ -2524,15 +2508,15 @@ define void @v_shuffle_v4i64_v2i64__3_u_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: v_mov_b32_e32 v5, v1
; GFX90A-NEXT: v_mov_b32_e32 v6, v0
; GFX90A-NEXT: v_mov_b32_e32 v7, v1
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: v_mov_b32_e32 v1, v3
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2542,15 +2526,15 @@ define void @v_shuffle_v4i64_v2i64__3_u_2_2(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: v_mov_b32_e32 v5, v1
; GFX942-NEXT: v_mov_b32_e32 v6, v0
; GFX942-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: v_mov_b32_e32 v1, v3
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i64> asm "; def $0", "=v"()
@@ -2593,17 +2577,17 @@ define void @v_shuffle_v4i64_v2i64__3_0_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v10, 0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
; GFX90A-NEXT: v_mov_b32_e32 v8, v2
; GFX90A-NEXT: v_mov_b32_e32 v9, v3
-; GFX90A-NEXT: v_mov_b32_e32 v10, v2
-; GFX90A-NEXT: v_mov_b32_e32 v11, v3
; GFX90A-NEXT: v_mov_b32_e32 v2, v4
; GFX90A-NEXT: v_mov_b32_e32 v3, v5
; GFX90A-NEXT: v_mov_b32_e32 v4, v0
; GFX90A-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2613,21 +2597,21 @@ define void @v_shuffle_v4i64_v2i64__3_0_2_2(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v10, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v3
; GFX942-NEXT: v_mov_b32_e32 v8, v2
; GFX942-NEXT: v_mov_b32_e32 v9, v3
-; GFX942-NEXT: v_mov_b32_e32 v10, v2
-; GFX942-NEXT: v_mov_b32_e32 v11, v3
; GFX942-NEXT: v_mov_b32_e32 v2, v4
; GFX942-NEXT: v_mov_b32_e32 v3, v5
; GFX942-NEXT: v_mov_b32_e32 v4, v0
; GFX942-NEXT: v_mov_b32_e32 v5, v1
-; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i64> asm "; def $0", "=v"()
@@ -2665,18 +2649,18 @@ define void @v_shuffle_v4i64_v2i64__3_1_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v12, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v8, v4
+; GFX90A-NEXT: v_mov_b32_e32 v9, v5
; GFX90A-NEXT: v_mov_b32_e32 v10, v4
; GFX90A-NEXT: v_mov_b32_e32 v11, v5
-; GFX90A-NEXT: v_mov_b32_e32 v12, v4
-; GFX90A-NEXT: v_mov_b32_e32 v13, v5
; GFX90A-NEXT: v_mov_b32_e32 v0, v6
; GFX90A-NEXT: v_mov_b32_e32 v1, v7
-; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2686,19 +2670,19 @@ define void @v_shuffle_v4i64_v2i64__3_1_2_2(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_mov_b32_e32 v12, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[4:7]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v8, v4
+; GFX942-NEXT: v_mov_b32_e32 v9, v5
; GFX942-NEXT: v_mov_b32_e32 v10, v4
; GFX942-NEXT: v_mov_b32_e32 v11, v5
-; GFX942-NEXT: v_mov_b32_e32 v12, v4
-; GFX942-NEXT: v_mov_b32_e32 v13, v5
; GFX942-NEXT: v_mov_b32_e32 v0, v6
; GFX942-NEXT: v_mov_b32_e32 v1, v7
-; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i64> asm "; def $0", "=v"()
@@ -2733,15 +2717,15 @@ define void @v_shuffle_v4i64_v2i64__3_3_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: v_mov_b32_e32 v5, v1
; GFX90A-NEXT: v_mov_b32_e32 v6, v0
; GFX90A-NEXT: v_mov_b32_e32 v7, v1
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: v_mov_b32_e32 v1, v3
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2751,15 +2735,15 @@ define void @v_shuffle_v4i64_v2i64__3_3_2_2(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: v_mov_b32_e32 v5, v1
; GFX942-NEXT: v_mov_b32_e32 v6, v0
; GFX942-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: v_mov_b32_e32 v1, v3
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i64> asm "; def $0", "=v"()
@@ -2792,13 +2776,13 @@ define void @v_shuffle_v4i64_v2i64__3_3_u_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: v_mov_b32_e32 v5, v1
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: v_mov_b32_e32 v1, v3
-; GFX90A-NEXT: global_store_dwordx4 v4, v[4:7], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2808,13 +2792,13 @@ define void @v_shuffle_v4i64_v2i64__3_3_u_2(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: v_mov_b32_e32 v7, v1
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: v_mov_b32_e32 v5, v1
; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: v_mov_b32_e32 v1, v3
-; GFX942-NEXT: global_store_dwordx4 v4, v[4:7], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i64> asm "; def $0", "=v"()
@@ -3150,33 +3134,33 @@ define void @v_shuffle_v4i64_v2i64__2_3_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i64_v2i64__2_3_3_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: v_mov_b32_e32 v6, v2
; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v2
-; GFX90A-NEXT: v_mov_b32_e32 v9, v3
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i64_v2i64__2_3_3_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v3
; GFX942-NEXT: v_mov_b32_e32 v6, v2
; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_mov_b32_e32 v8, v2
-; GFX942-NEXT: v_mov_b32_e32 v9, v3
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i64> asm "; def $0", "=v"()
@@ -3390,39 +3374,39 @@ define void @v_shuffle_v4i64_v2i64__3_2_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_2_3_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: v_mov_b32_e32 v6, v2
; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v2
-; GFX90A-NEXT: v_mov_b32_e32 v9, v3
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: v_mov_b32_e32 v5, v1
+; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i64_v2i64__3_2_3_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v3
; GFX942-NEXT: v_mov_b32_e32 v6, v2
; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_mov_b32_e32 v8, v2
-; GFX942-NEXT: v_mov_b32_e32 v9, v3
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: v_mov_b32_e32 v5, v1
+; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x i64> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v3i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v3i64.ll
index 615b382aa355a..8dd4a40d00680 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v3i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v3i64.ll
@@ -1126,15 +1126,15 @@ define void @v_shuffle_v4i64_v3i64__5_5_5_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v4
-; GFX90A-NEXT: v_mov_b32_e32 v9, v5
-; GFX90A-NEXT: v_mov_b32_e32 v10, v0
-; GFX90A-NEXT: v_mov_b32_e32 v11, v1
+; GFX90A-NEXT: v_mov_b32_e32 v10, 0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v5
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
; GFX90A-NEXT: v_mov_b32_e32 v2, v4
; GFX90A-NEXT: v_mov_b32_e32 v3, v5
-; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1144,15 +1144,15 @@ define void @v_shuffle_v4i64_v3i64__5_5_5_3(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
-; GFX942-NEXT: v_mov_b32_e32 v8, v4
-; GFX942-NEXT: v_mov_b32_e32 v9, v5
-; GFX942-NEXT: v_mov_b32_e32 v10, v0
-; GFX942-NEXT: v_mov_b32_e32 v11, v1
+; GFX942-NEXT: v_mov_b32_e32 v10, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v7, v5
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
; GFX942-NEXT: v_mov_b32_e32 v2, v4
; GFX942-NEXT: v_mov_b32_e32 v3, v5
-; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i64> asm "; def $0", "=v"()
@@ -1388,15 +1388,13 @@ define void @v_shuffle_v4i64_v3i64__1_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
-; GFX90A-NEXT: v_mov_b32_e32 v10, v0
-; GFX90A-NEXT: v_mov_b32_e32 v11, v1
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: v_mov_b32_e32 v4, v0
; GFX90A-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1406,15 +1404,13 @@ define void @v_shuffle_v4i64_v3i64__1_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
-; GFX942-NEXT: v_mov_b32_e32 v10, v0
-; GFX942-NEXT: v_mov_b32_e32 v11, v1
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: v_mov_b32_e32 v4, v0
; GFX942-NEXT: v_mov_b32_e32 v5, v1
-; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v7, v1
+; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i64> asm "; def $0", "=v"()
@@ -3641,33 +3637,33 @@ define void @v_shuffle_v4i64_v3i64__1_2_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i64_v3i64__1_2_2_2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v10, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:5]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v5
; GFX90A-NEXT: v_mov_b32_e32 v8, v4
; GFX90A-NEXT: v_mov_b32_e32 v9, v5
-; GFX90A-NEXT: v_mov_b32_e32 v10, v4
-; GFX90A-NEXT: v_mov_b32_e32 v11, v5
-; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i64_v3i64__1_2_2_2:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v10, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:5]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v7, v5
; GFX942-NEXT: v_mov_b32_e32 v8, v4
; GFX942-NEXT: v_mov_b32_e32 v9, v5
-; GFX942-NEXT: v_mov_b32_e32 v10, v4
-; GFX942-NEXT: v_mov_b32_e32 v11, v5
-; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i64> asm "; def $0", "=v"()
@@ -4791,15 +4787,13 @@ define void @v_shuffle_v4i64_v3i64__4_3_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
-; GFX90A-NEXT: v_mov_b32_e32 v10, v0
-; GFX90A-NEXT: v_mov_b32_e32 v11, v1
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: v_mov_b32_e32 v4, v0
; GFX90A-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -4809,15 +4803,13 @@ define void @v_shuffle_v4i64_v3i64__4_3_3_3(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
-; GFX942-NEXT: v_mov_b32_e32 v10, v0
-; GFX942-NEXT: v_mov_b32_e32 v11, v1
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: v_mov_b32_e32 v4, v0
; GFX942-NEXT: v_mov_b32_e32 v5, v1
-; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v7, v1
+; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i64> asm "; def $0", "=v"()
@@ -5177,15 +5169,15 @@ define void @v_shuffle_v4i64_v3i64__5_4_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v10, 0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v1
; GFX90A-NEXT: v_mov_b32_e32 v8, v0
; GFX90A-NEXT: v_mov_b32_e32 v9, v1
-; GFX90A-NEXT: v_mov_b32_e32 v10, v0
-; GFX90A-NEXT: v_mov_b32_e32 v11, v1
; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: v_mov_b32_e32 v1, v5
-; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -5195,15 +5187,15 @@ define void @v_shuffle_v4i64_v3i64__5_4_3_3(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v10, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v7, v1
; GFX942-NEXT: v_mov_b32_e32 v8, v0
; GFX942-NEXT: v_mov_b32_e32 v9, v1
-; GFX942-NEXT: v_mov_b32_e32 v10, v0
-; GFX942-NEXT: v_mov_b32_e32 v11, v1
; GFX942-NEXT: v_mov_b32_e32 v0, v4
; GFX942-NEXT: v_mov_b32_e32 v1, v5
-; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i64> asm "; def $0", "=v"()
@@ -5540,15 +5532,15 @@ define void @v_shuffle_v4i64_v3i64__5_5_4_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v2
-; GFX90A-NEXT: v_mov_b32_e32 v9, v3
-; GFX90A-NEXT: v_mov_b32_e32 v10, v0
-; GFX90A-NEXT: v_mov_b32_e32 v11, v1
+; GFX90A-NEXT: v_mov_b32_e32 v10, 0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
; GFX90A-NEXT: v_mov_b32_e32 v2, v4
; GFX90A-NEXT: v_mov_b32_e32 v3, v5
-; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -5558,15 +5550,15 @@ define void @v_shuffle_v4i64_v3i64__5_5_4_3(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
-; GFX942-NEXT: v_mov_b32_e32 v8, v2
-; GFX942-NEXT: v_mov_b32_e32 v9, v3
-; GFX942-NEXT: v_mov_b32_e32 v10, v0
-; GFX942-NEXT: v_mov_b32_e32 v11, v1
+; GFX942-NEXT: v_mov_b32_e32 v10, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v3
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
; GFX942-NEXT: v_mov_b32_e32 v2, v4
; GFX942-NEXT: v_mov_b32_e32 v3, v5
-; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i64> asm "; def $0", "=v"()
@@ -6263,17 +6255,17 @@ define void @v_shuffle_v4i64_v3i64__5_3_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v10, 0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
; GFX90A-NEXT: v_mov_b32_e32 v8, v2
; GFX90A-NEXT: v_mov_b32_e32 v9, v3
-; GFX90A-NEXT: v_mov_b32_e32 v10, v2
-; GFX90A-NEXT: v_mov_b32_e32 v11, v3
; GFX90A-NEXT: v_mov_b32_e32 v2, v4
; GFX90A-NEXT: v_mov_b32_e32 v3, v5
; GFX90A-NEXT: v_mov_b32_e32 v4, v0
; GFX90A-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -6283,17 +6275,17 @@ define void @v_shuffle_v4i64_v3i64__5_3_4_4(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v10, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v3
; GFX942-NEXT: v_mov_b32_e32 v8, v2
; GFX942-NEXT: v_mov_b32_e32 v9, v3
-; GFX942-NEXT: v_mov_b32_e32 v10, v2
-; GFX942-NEXT: v_mov_b32_e32 v11, v3
; GFX942-NEXT: v_mov_b32_e32 v2, v4
; GFX942-NEXT: v_mov_b32_e32 v3, v5
; GFX942-NEXT: v_mov_b32_e32 v4, v0
; GFX942-NEXT: v_mov_b32_e32 v5, v1
-; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i64> asm "; def $0", "=v"()
@@ -6978,33 +6970,33 @@ define void @v_shuffle_v4i64_v3i64__4_5_5_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4i64_v3i64__4_5_5_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v10, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:5]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v5
; GFX90A-NEXT: v_mov_b32_e32 v8, v4
; GFX90A-NEXT: v_mov_b32_e32 v9, v5
-; GFX90A-NEXT: v_mov_b32_e32 v10, v4
-; GFX90A-NEXT: v_mov_b32_e32 v11, v5
-; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4i64_v3i64__4_5_5_5:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v10, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:5]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v7, v5
; GFX942-NEXT: v_mov_b32_e32 v8, v4
; GFX942-NEXT: v_mov_b32_e32 v9, v5
-; GFX942-NEXT: v_mov_b32_e32 v10, v4
-; GFX942-NEXT: v_mov_b32_e32 v11, v5
-; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i64> asm "; def $0", "=v"()
@@ -7352,15 +7344,15 @@ define void @v_shuffle_v4i64_v3i64__5_4_5_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v10, 0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v5
; GFX90A-NEXT: v_mov_b32_e32 v8, v4
; GFX90A-NEXT: v_mov_b32_e32 v9, v5
-; GFX90A-NEXT: v_mov_b32_e32 v10, v4
-; GFX90A-NEXT: v_mov_b32_e32 v11, v5
; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: v_mov_b32_e32 v1, v5
-; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -7370,15 +7362,15 @@ define void @v_shuffle_v4i64_v3i64__5_4_5_5(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v10, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v7, v5
; GFX942-NEXT: v_mov_b32_e32 v8, v4
; GFX942-NEXT: v_mov_b32_e32 v9, v5
-; GFX942-NEXT: v_mov_b32_e32 v10, v4
-; GFX942-NEXT: v_mov_b32_e32 v11, v5
; GFX942-NEXT: v_mov_b32_e32 v0, v4
; GFX942-NEXT: v_mov_b32_e32 v1, v5
-; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x i64> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v4i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v4i64.ll
index 32f6e00716e37..ea9ef2f1ac94a 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v4i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v4i64.ll
@@ -8328,15 +8328,15 @@ define void @v_shuffle_v4i64_v4i64__7_5_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v12, 0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
; GFX90A-NEXT: v_mov_b32_e32 v10, v0
; GFX90A-NEXT: v_mov_b32_e32 v11, v1
-; GFX90A-NEXT: v_mov_b32_e32 v12, v0
-; GFX90A-NEXT: v_mov_b32_e32 v13, v1
; GFX90A-NEXT: v_mov_b32_e32 v0, v6
; GFX90A-NEXT: v_mov_b32_e32 v1, v7
-; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -8346,15 +8346,15 @@ define void @v_shuffle_v4i64_v4i64__7_5_4_4(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:7]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_mov_b32_e32 v12, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
; GFX942-NEXT: v_mov_b32_e32 v10, v0
; GFX942-NEXT: v_mov_b32_e32 v11, v1
-; GFX942-NEXT: v_mov_b32_e32 v12, v0
-; GFX942-NEXT: v_mov_b32_e32 v13, v1
; GFX942-NEXT: v_mov_b32_e32 v0, v6
; GFX942-NEXT: v_mov_b32_e32 v1, v7
-; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i64> asm "; def $0", "=v"()
@@ -11254,15 +11254,15 @@ define void @v_shuffle_v4i64_v4i64__7_5_6_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v12, 0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v4
+; GFX90A-NEXT: v_mov_b32_e32 v9, v5
; GFX90A-NEXT: v_mov_b32_e32 v10, v4
; GFX90A-NEXT: v_mov_b32_e32 v11, v5
-; GFX90A-NEXT: v_mov_b32_e32 v12, v4
-; GFX90A-NEXT: v_mov_b32_e32 v13, v5
; GFX90A-NEXT: v_mov_b32_e32 v0, v6
; GFX90A-NEXT: v_mov_b32_e32 v1, v7
-; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -11272,15 +11272,15 @@ define void @v_shuffle_v4i64_v4i64__7_5_6_6(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:7]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_mov_b32_e32 v12, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, v4
+; GFX942-NEXT: v_mov_b32_e32 v9, v5
; GFX942-NEXT: v_mov_b32_e32 v10, v4
; GFX942-NEXT: v_mov_b32_e32 v11, v5
-; GFX942-NEXT: v_mov_b32_e32 v12, v4
-; GFX942-NEXT: v_mov_b32_e32 v13, v5
; GFX942-NEXT: v_mov_b32_e32 v0, v6
; GFX942-NEXT: v_mov_b32_e32 v1, v7
-; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x i64> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v2p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v2p0.ll
index ee3b303f88471..b30af835a7882 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v2p0.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v2p0.ll
@@ -291,31 +291,27 @@ define void @v_shuffle_v4p0_v2p0__3_2_u_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_2_u_u:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: v_mov_b32_e32 v5, v1
+; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p0_v2p0__3_2_u_u:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: v_mov_b32_e32 v5, v1
+; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr> asm "; def $0", "=v"()
@@ -628,15 +624,15 @@ define void @v_shuffle_v4p0_v2p0__3_3_3_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v4
-; GFX90A-NEXT: v_mov_b32_e32 v9, v5
-; GFX90A-NEXT: v_mov_b32_e32 v10, v0
-; GFX90A-NEXT: v_mov_b32_e32 v11, v1
+; GFX90A-NEXT: v_mov_b32_e32 v10, 0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v5
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
; GFX90A-NEXT: v_mov_b32_e32 v2, v4
; GFX90A-NEXT: v_mov_b32_e32 v3, v5
-; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -646,18 +642,18 @@ define void @v_shuffle_v4p0_v2p0__3_3_3_0(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v10, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v10, v0
-; GFX942-NEXT: v_mov_b32_e32 v8, v4
-; GFX942-NEXT: v_mov_b32_e32 v9, v5
-; GFX942-NEXT: v_mov_b32_e32 v11, v1
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v7, v5
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
; GFX942-NEXT: v_mov_b32_e32 v2, v4
; GFX942-NEXT: v_mov_b32_e32 v3, v5
-; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr> asm "; def $0", "=v"()
@@ -756,15 +752,15 @@ define void @v_shuffle_v4p0_v2p0__3_3_3_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v1
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: v_mov_b32_e32 v1, v3
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -774,15 +770,15 @@ define void @v_shuffle_v4p0_v2p0__3_3_3_2(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v3
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v7, v1
; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: v_mov_b32_e32 v1, v3
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr> asm "; def $0", "=v"()
@@ -957,39 +953,33 @@ define void @v_shuffle_v4p0_v2p0__1_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p0_v2p0__1_0_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: v_mov_b32_e32 v5, v1
; GFX90A-NEXT: v_mov_b32_e32 v6, v0
; GFX90A-NEXT: v_mov_b32_e32 v7, v1
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16
-; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p0_v2p0__1_0_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: v_mov_b32_e32 v5, v1
; GFX942-NEXT: v_mov_b32_e32 v6, v0
; GFX942-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16
-; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr> asm "; def $0", "=v"()
@@ -1216,18 +1206,18 @@ define void @v_shuffle_v4p0_v2p0__3_1_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v12, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
; GFX90A-NEXT: v_mov_b32_e32 v10, v0
; GFX90A-NEXT: v_mov_b32_e32 v11, v1
-; GFX90A-NEXT: v_mov_b32_e32 v12, v0
-; GFX90A-NEXT: v_mov_b32_e32 v13, v1
; GFX90A-NEXT: v_mov_b32_e32 v0, v6
; GFX90A-NEXT: v_mov_b32_e32 v1, v7
-; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1237,18 +1227,18 @@ define void @v_shuffle_v4p0_v2p0__3_1_0_0(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_mov_b32_e32 v12, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[4:7]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
; GFX942-NEXT: v_mov_b32_e32 v10, v0
; GFX942-NEXT: v_mov_b32_e32 v11, v1
-; GFX942-NEXT: v_mov_b32_e32 v12, v0
-; GFX942-NEXT: v_mov_b32_e32 v13, v1
; GFX942-NEXT: v_mov_b32_e32 v0, v6
; GFX942-NEXT: v_mov_b32_e32 v1, v7
-; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr> asm "; def $0", "=v"()
@@ -1566,15 +1556,15 @@ define void @v_shuffle_v4p0_v2p0__3_3_2_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v2
-; GFX90A-NEXT: v_mov_b32_e32 v9, v3
-; GFX90A-NEXT: v_mov_b32_e32 v10, v0
-; GFX90A-NEXT: v_mov_b32_e32 v11, v1
+; GFX90A-NEXT: v_mov_b32_e32 v10, 0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
; GFX90A-NEXT: v_mov_b32_e32 v2, v4
; GFX90A-NEXT: v_mov_b32_e32 v3, v5
-; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1584,18 +1574,18 @@ define void @v_shuffle_v4p0_v2p0__3_3_2_0(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v10, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v10, v0
-; GFX942-NEXT: v_mov_b32_e32 v8, v2
-; GFX942-NEXT: v_mov_b32_e32 v9, v3
-; GFX942-NEXT: v_mov_b32_e32 v11, v1
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v3
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
; GFX942-NEXT: v_mov_b32_e32 v2, v4
; GFX942-NEXT: v_mov_b32_e32 v3, v5
-; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr> asm "; def $0", "=v"()
@@ -1673,33 +1663,33 @@ define void @v_shuffle_v4p0_v2p0__0_1_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p0_v2p0__0_1_1_1:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: v_mov_b32_e32 v6, v2
; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v2
-; GFX90A-NEXT: v_mov_b32_e32 v9, v3
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p0_v2p0__0_1_1_1:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v3
; GFX942-NEXT: v_mov_b32_e32 v6, v2
; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_mov_b32_e32 v8, v2
-; GFX942-NEXT: v_mov_b32_e32 v9, v3
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr> asm "; def $0", "=v"()
@@ -1971,17 +1961,17 @@ define void @v_shuffle_v4p0_v2p0__3_0_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v12, 0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v2
+; GFX90A-NEXT: v_mov_b32_e32 v9, v3
; GFX90A-NEXT: v_mov_b32_e32 v10, v2
; GFX90A-NEXT: v_mov_b32_e32 v11, v3
-; GFX90A-NEXT: v_mov_b32_e32 v12, v2
-; GFX90A-NEXT: v_mov_b32_e32 v13, v3
; GFX90A-NEXT: v_mov_b32_e32 v2, v6
; GFX90A-NEXT: v_mov_b32_e32 v3, v7
; GFX90A-NEXT: v_mov_b32_e32 v4, v0
; GFX90A-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1994,17 +1984,17 @@ define void @v_shuffle_v4p0_v2p0__3_0_1_1(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[4:7]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_mov_b32_e32 v12, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, v2
+; GFX942-NEXT: v_mov_b32_e32 v9, v3
; GFX942-NEXT: v_mov_b32_e32 v10, v2
; GFX942-NEXT: v_mov_b32_e32 v11, v3
-; GFX942-NEXT: v_mov_b32_e32 v12, v2
-; GFX942-NEXT: v_mov_b32_e32 v13, v3
; GFX942-NEXT: v_mov_b32_e32 v2, v6
; GFX942-NEXT: v_mov_b32_e32 v3, v7
; GFX942-NEXT: v_mov_b32_e32 v4, v0
; GFX942-NEXT: v_mov_b32_e32 v5, v1
-; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr> asm "; def $0", "=v"()
@@ -2457,39 +2447,33 @@ define void @v_shuffle_v4p0_v2p0__3_2_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_2_2_2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: v_mov_b32_e32 v5, v1
; GFX90A-NEXT: v_mov_b32_e32 v6, v0
; GFX90A-NEXT: v_mov_b32_e32 v7, v1
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16
-; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p0_v2p0__3_2_2_2:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: v_mov_b32_e32 v5, v1
; GFX942-NEXT: v_mov_b32_e32 v6, v0
; GFX942-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16
-; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr> asm "; def $0", "=v"()
@@ -2524,15 +2508,15 @@ define void @v_shuffle_v4p0_v2p0__3_u_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: v_mov_b32_e32 v5, v1
; GFX90A-NEXT: v_mov_b32_e32 v6, v0
; GFX90A-NEXT: v_mov_b32_e32 v7, v1
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: v_mov_b32_e32 v1, v3
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2542,15 +2526,15 @@ define void @v_shuffle_v4p0_v2p0__3_u_2_2(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: v_mov_b32_e32 v5, v1
; GFX942-NEXT: v_mov_b32_e32 v6, v0
; GFX942-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: v_mov_b32_e32 v1, v3
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr> asm "; def $0", "=v"()
@@ -2593,17 +2577,17 @@ define void @v_shuffle_v4p0_v2p0__3_0_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v10, 0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
; GFX90A-NEXT: v_mov_b32_e32 v8, v2
; GFX90A-NEXT: v_mov_b32_e32 v9, v3
-; GFX90A-NEXT: v_mov_b32_e32 v10, v2
-; GFX90A-NEXT: v_mov_b32_e32 v11, v3
; GFX90A-NEXT: v_mov_b32_e32 v2, v4
; GFX90A-NEXT: v_mov_b32_e32 v3, v5
; GFX90A-NEXT: v_mov_b32_e32 v4, v0
; GFX90A-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2613,21 +2597,21 @@ define void @v_shuffle_v4p0_v2p0__3_0_2_2(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v10, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v3
; GFX942-NEXT: v_mov_b32_e32 v8, v2
; GFX942-NEXT: v_mov_b32_e32 v9, v3
-; GFX942-NEXT: v_mov_b32_e32 v10, v2
-; GFX942-NEXT: v_mov_b32_e32 v11, v3
; GFX942-NEXT: v_mov_b32_e32 v2, v4
; GFX942-NEXT: v_mov_b32_e32 v3, v5
; GFX942-NEXT: v_mov_b32_e32 v4, v0
; GFX942-NEXT: v_mov_b32_e32 v5, v1
-; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr> asm "; def $0", "=v"()
@@ -2665,18 +2649,18 @@ define void @v_shuffle_v4p0_v2p0__3_1_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v12, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v8, v4
+; GFX90A-NEXT: v_mov_b32_e32 v9, v5
; GFX90A-NEXT: v_mov_b32_e32 v10, v4
; GFX90A-NEXT: v_mov_b32_e32 v11, v5
-; GFX90A-NEXT: v_mov_b32_e32 v12, v4
-; GFX90A-NEXT: v_mov_b32_e32 v13, v5
; GFX90A-NEXT: v_mov_b32_e32 v0, v6
; GFX90A-NEXT: v_mov_b32_e32 v1, v7
-; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2686,19 +2670,19 @@ define void @v_shuffle_v4p0_v2p0__3_1_2_2(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_mov_b32_e32 v12, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[4:7]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v8, v4
+; GFX942-NEXT: v_mov_b32_e32 v9, v5
; GFX942-NEXT: v_mov_b32_e32 v10, v4
; GFX942-NEXT: v_mov_b32_e32 v11, v5
-; GFX942-NEXT: v_mov_b32_e32 v12, v4
-; GFX942-NEXT: v_mov_b32_e32 v13, v5
; GFX942-NEXT: v_mov_b32_e32 v0, v6
; GFX942-NEXT: v_mov_b32_e32 v1, v7
-; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr> asm "; def $0", "=v"()
@@ -2733,15 +2717,15 @@ define void @v_shuffle_v4p0_v2p0__3_3_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: v_mov_b32_e32 v5, v1
; GFX90A-NEXT: v_mov_b32_e32 v6, v0
; GFX90A-NEXT: v_mov_b32_e32 v7, v1
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: v_mov_b32_e32 v1, v3
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2751,15 +2735,15 @@ define void @v_shuffle_v4p0_v2p0__3_3_2_2(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: v_mov_b32_e32 v5, v1
; GFX942-NEXT: v_mov_b32_e32 v6, v0
; GFX942-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: v_mov_b32_e32 v1, v3
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr> asm "; def $0", "=v"()
@@ -2792,13 +2776,13 @@ define void @v_shuffle_v4p0_v2p0__3_3_u_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: v_mov_b32_e32 v5, v1
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
; GFX90A-NEXT: v_mov_b32_e32 v1, v3
-; GFX90A-NEXT: global_store_dwordx4 v4, v[4:7], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2808,13 +2792,13 @@ define void @v_shuffle_v4p0_v2p0__3_3_u_2(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: v_mov_b32_e32 v7, v1
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: v_mov_b32_e32 v5, v1
; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: v_mov_b32_e32 v1, v3
-; GFX942-NEXT: global_store_dwordx4 v4, v[4:7], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr> asm "; def $0", "=v"()
@@ -3150,33 +3134,33 @@ define void @v_shuffle_v4p0_v2p0__2_3_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p0_v2p0__2_3_3_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: v_mov_b32_e32 v6, v2
; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v2
-; GFX90A-NEXT: v_mov_b32_e32 v9, v3
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p0_v2p0__2_3_3_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v3
; GFX942-NEXT: v_mov_b32_e32 v6, v2
; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_mov_b32_e32 v8, v2
-; GFX942-NEXT: v_mov_b32_e32 v9, v3
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr> asm "; def $0", "=v"()
@@ -3390,39 +3374,39 @@ define void @v_shuffle_v4p0_v2p0__3_2_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_2_3_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: v_mov_b32_e32 v6, v2
; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v2
-; GFX90A-NEXT: v_mov_b32_e32 v9, v3
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: v_mov_b32_e32 v5, v1
+; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p0_v2p0__3_2_3_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v3
; GFX942-NEXT: v_mov_b32_e32 v6, v2
; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_mov_b32_e32 v8, v2
-; GFX942-NEXT: v_mov_b32_e32 v9, v3
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: v_mov_b32_e32 v5, v1
+; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v3p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v3p0.ll
index 09e497259766e..e6ac554735eee 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v3p0.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v3p0.ll
@@ -1126,15 +1126,15 @@ define void @v_shuffle_v4p0_v3p0__5_5_5_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v4
-; GFX90A-NEXT: v_mov_b32_e32 v9, v5
-; GFX90A-NEXT: v_mov_b32_e32 v10, v0
-; GFX90A-NEXT: v_mov_b32_e32 v11, v1
+; GFX90A-NEXT: v_mov_b32_e32 v10, 0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v5
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
; GFX90A-NEXT: v_mov_b32_e32 v2, v4
; GFX90A-NEXT: v_mov_b32_e32 v3, v5
-; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1144,15 +1144,15 @@ define void @v_shuffle_v4p0_v3p0__5_5_5_3(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
-; GFX942-NEXT: v_mov_b32_e32 v8, v4
-; GFX942-NEXT: v_mov_b32_e32 v9, v5
-; GFX942-NEXT: v_mov_b32_e32 v10, v0
-; GFX942-NEXT: v_mov_b32_e32 v11, v1
+; GFX942-NEXT: v_mov_b32_e32 v10, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v7, v5
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
; GFX942-NEXT: v_mov_b32_e32 v2, v4
; GFX942-NEXT: v_mov_b32_e32 v3, v5
-; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr> asm "; def $0", "=v"()
@@ -1388,15 +1388,13 @@ define void @v_shuffle_v4p0_v3p0__1_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
-; GFX90A-NEXT: v_mov_b32_e32 v10, v0
-; GFX90A-NEXT: v_mov_b32_e32 v11, v1
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: v_mov_b32_e32 v4, v0
; GFX90A-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1406,15 +1404,13 @@ define void @v_shuffle_v4p0_v3p0__1_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
-; GFX942-NEXT: v_mov_b32_e32 v10, v0
-; GFX942-NEXT: v_mov_b32_e32 v11, v1
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: v_mov_b32_e32 v4, v0
; GFX942-NEXT: v_mov_b32_e32 v5, v1
-; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v7, v1
+; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr> asm "; def $0", "=v"()
@@ -3641,33 +3637,33 @@ define void @v_shuffle_v4p0_v3p0__1_2_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p0_v3p0__1_2_2_2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v10, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:5]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v5
; GFX90A-NEXT: v_mov_b32_e32 v8, v4
; GFX90A-NEXT: v_mov_b32_e32 v9, v5
-; GFX90A-NEXT: v_mov_b32_e32 v10, v4
-; GFX90A-NEXT: v_mov_b32_e32 v11, v5
-; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p0_v3p0__1_2_2_2:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v10, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:5]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v7, v5
; GFX942-NEXT: v_mov_b32_e32 v8, v4
; GFX942-NEXT: v_mov_b32_e32 v9, v5
-; GFX942-NEXT: v_mov_b32_e32 v10, v4
-; GFX942-NEXT: v_mov_b32_e32 v11, v5
-; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr> asm "; def $0", "=v"()
@@ -4791,15 +4787,13 @@ define void @v_shuffle_v4p0_v3p0__4_3_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
-; GFX90A-NEXT: v_mov_b32_e32 v10, v0
-; GFX90A-NEXT: v_mov_b32_e32 v11, v1
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: v_mov_b32_e32 v4, v0
; GFX90A-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -4809,15 +4803,13 @@ define void @v_shuffle_v4p0_v3p0__4_3_3_3(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
-; GFX942-NEXT: v_mov_b32_e32 v10, v0
-; GFX942-NEXT: v_mov_b32_e32 v11, v1
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: v_mov_b32_e32 v4, v0
; GFX942-NEXT: v_mov_b32_e32 v5, v1
-; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v7, v1
+; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr> asm "; def $0", "=v"()
@@ -5177,15 +5169,15 @@ define void @v_shuffle_v4p0_v3p0__5_4_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v10, 0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v1
; GFX90A-NEXT: v_mov_b32_e32 v8, v0
; GFX90A-NEXT: v_mov_b32_e32 v9, v1
-; GFX90A-NEXT: v_mov_b32_e32 v10, v0
-; GFX90A-NEXT: v_mov_b32_e32 v11, v1
; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: v_mov_b32_e32 v1, v5
-; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -5195,15 +5187,15 @@ define void @v_shuffle_v4p0_v3p0__5_4_3_3(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v10, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v7, v1
; GFX942-NEXT: v_mov_b32_e32 v8, v0
; GFX942-NEXT: v_mov_b32_e32 v9, v1
-; GFX942-NEXT: v_mov_b32_e32 v10, v0
-; GFX942-NEXT: v_mov_b32_e32 v11, v1
; GFX942-NEXT: v_mov_b32_e32 v0, v4
; GFX942-NEXT: v_mov_b32_e32 v1, v5
-; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr> asm "; def $0", "=v"()
@@ -5540,15 +5532,15 @@ define void @v_shuffle_v4p0_v3p0__5_5_4_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v2
-; GFX90A-NEXT: v_mov_b32_e32 v9, v3
-; GFX90A-NEXT: v_mov_b32_e32 v10, v0
-; GFX90A-NEXT: v_mov_b32_e32 v11, v1
+; GFX90A-NEXT: v_mov_b32_e32 v10, 0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
; GFX90A-NEXT: v_mov_b32_e32 v2, v4
; GFX90A-NEXT: v_mov_b32_e32 v3, v5
-; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -5558,15 +5550,15 @@ define void @v_shuffle_v4p0_v3p0__5_5_4_3(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
-; GFX942-NEXT: v_mov_b32_e32 v8, v2
-; GFX942-NEXT: v_mov_b32_e32 v9, v3
-; GFX942-NEXT: v_mov_b32_e32 v10, v0
-; GFX942-NEXT: v_mov_b32_e32 v11, v1
+; GFX942-NEXT: v_mov_b32_e32 v10, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v3
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
; GFX942-NEXT: v_mov_b32_e32 v2, v4
; GFX942-NEXT: v_mov_b32_e32 v3, v5
-; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr> asm "; def $0", "=v"()
@@ -6263,17 +6255,17 @@ define void @v_shuffle_v4p0_v3p0__5_3_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v10, 0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
; GFX90A-NEXT: v_mov_b32_e32 v8, v2
; GFX90A-NEXT: v_mov_b32_e32 v9, v3
-; GFX90A-NEXT: v_mov_b32_e32 v10, v2
-; GFX90A-NEXT: v_mov_b32_e32 v11, v3
; GFX90A-NEXT: v_mov_b32_e32 v2, v4
; GFX90A-NEXT: v_mov_b32_e32 v3, v5
; GFX90A-NEXT: v_mov_b32_e32 v4, v0
; GFX90A-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -6283,17 +6275,17 @@ define void @v_shuffle_v4p0_v3p0__5_3_4_4(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v10, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v3
; GFX942-NEXT: v_mov_b32_e32 v8, v2
; GFX942-NEXT: v_mov_b32_e32 v9, v3
-; GFX942-NEXT: v_mov_b32_e32 v10, v2
-; GFX942-NEXT: v_mov_b32_e32 v11, v3
; GFX942-NEXT: v_mov_b32_e32 v2, v4
; GFX942-NEXT: v_mov_b32_e32 v3, v5
; GFX942-NEXT: v_mov_b32_e32 v4, v0
; GFX942-NEXT: v_mov_b32_e32 v5, v1
-; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr> asm "; def $0", "=v"()
@@ -6978,33 +6970,33 @@ define void @v_shuffle_v4p0_v3p0__4_5_5_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p0_v3p0__4_5_5_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v10, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:5]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v5
; GFX90A-NEXT: v_mov_b32_e32 v8, v4
; GFX90A-NEXT: v_mov_b32_e32 v9, v5
-; GFX90A-NEXT: v_mov_b32_e32 v10, v4
-; GFX90A-NEXT: v_mov_b32_e32 v11, v5
-; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p0_v3p0__4_5_5_5:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v10, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:5]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v7, v5
; GFX942-NEXT: v_mov_b32_e32 v8, v4
; GFX942-NEXT: v_mov_b32_e32 v9, v5
-; GFX942-NEXT: v_mov_b32_e32 v10, v4
-; GFX942-NEXT: v_mov_b32_e32 v11, v5
-; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr> asm "; def $0", "=v"()
@@ -7352,15 +7344,15 @@ define void @v_shuffle_v4p0_v3p0__5_4_5_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v10, 0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v4
+; GFX90A-NEXT: v_mov_b32_e32 v7, v5
; GFX90A-NEXT: v_mov_b32_e32 v8, v4
; GFX90A-NEXT: v_mov_b32_e32 v9, v5
-; GFX90A-NEXT: v_mov_b32_e32 v10, v4
-; GFX90A-NEXT: v_mov_b32_e32 v11, v5
; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: v_mov_b32_e32 v1, v5
-; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -7370,15 +7362,15 @@ define void @v_shuffle_v4p0_v3p0__5_4_5_5(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v10, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-NEXT: v_mov_b32_e32 v7, v5
; GFX942-NEXT: v_mov_b32_e32 v8, v4
; GFX942-NEXT: v_mov_b32_e32 v9, v5
-; GFX942-NEXT: v_mov_b32_e32 v10, v4
-; GFX942-NEXT: v_mov_b32_e32 v11, v5
; GFX942-NEXT: v_mov_b32_e32 v0, v4
; GFX942-NEXT: v_mov_b32_e32 v1, v5
-; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v4p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v4p0.ll
index 257af574366a6..ce1c54129f706 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v4p0.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v4p0.ll
@@ -8328,15 +8328,15 @@ define void @v_shuffle_v4p0_v4p0__7_5_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v12, 0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
; GFX90A-NEXT: v_mov_b32_e32 v10, v0
; GFX90A-NEXT: v_mov_b32_e32 v11, v1
-; GFX90A-NEXT: v_mov_b32_e32 v12, v0
-; GFX90A-NEXT: v_mov_b32_e32 v13, v1
; GFX90A-NEXT: v_mov_b32_e32 v0, v6
; GFX90A-NEXT: v_mov_b32_e32 v1, v7
-; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -8346,15 +8346,15 @@ define void @v_shuffle_v4p0_v4p0__7_5_4_4(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:7]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_mov_b32_e32 v12, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
; GFX942-NEXT: v_mov_b32_e32 v10, v0
; GFX942-NEXT: v_mov_b32_e32 v11, v1
-; GFX942-NEXT: v_mov_b32_e32 v12, v0
-; GFX942-NEXT: v_mov_b32_e32 v13, v1
; GFX942-NEXT: v_mov_b32_e32 v0, v6
; GFX942-NEXT: v_mov_b32_e32 v1, v7
-; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr> asm "; def $0", "=v"()
@@ -11254,15 +11254,15 @@ define void @v_shuffle_v4p0_v4p0__7_5_6_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v12, 0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v4
+; GFX90A-NEXT: v_mov_b32_e32 v9, v5
; GFX90A-NEXT: v_mov_b32_e32 v10, v4
; GFX90A-NEXT: v_mov_b32_e32 v11, v5
-; GFX90A-NEXT: v_mov_b32_e32 v12, v4
-; GFX90A-NEXT: v_mov_b32_e32 v13, v5
; GFX90A-NEXT: v_mov_b32_e32 v0, v6
; GFX90A-NEXT: v_mov_b32_e32 v1, v7
-; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -11272,15 +11272,15 @@ define void @v_shuffle_v4p0_v4p0__7_5_6_6(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:7]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_mov_b32_e32 v12, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, v4
+; GFX942-NEXT: v_mov_b32_e32 v9, v5
; GFX942-NEXT: v_mov_b32_e32 v10, v4
; GFX942-NEXT: v_mov_b32_e32 v11, v5
-; GFX942-NEXT: v_mov_b32_e32 v12, v4
-; GFX942-NEXT: v_mov_b32_e32 v13, v5
; GFX942-NEXT: v_mov_b32_e32 v0, v6
; GFX942-NEXT: v_mov_b32_e32 v1, v7
-; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v2p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v2p3.ll
index 90a1b99dc7c14..3b5690562c38a 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v2p3.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v2p3.ll
@@ -272,27 +272,27 @@ define void @v_shuffle_v4p3_v2p3__3_2_u_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v2p3__3_2_u_u:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v1
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: global_store_dwordx4 v2, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v2p3__3_2_u_u:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:1]
+; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v1
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: global_store_dwordx4 v2, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -2386,29 +2386,28 @@ define void @v_shuffle_v4p3_v2p3__3_3_u_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v2p3__3_3_u_2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v1
-; GFX90A-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: global_store_dwordx4 v2, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v2p3__3_3_u_2:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:1]
+; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v1
-; GFX942-NEXT: v_mov_b32_e32 v5, v1
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: global_store_dwordx4 v2, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v3
+; GFX942-NEXT: v_mov_b32_e32 v1, v3
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
+; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll
index d13d26f638e0c..8039e126590b9 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll
@@ -255,15 +255,15 @@ define void @v_shuffle_v4p3_v3p3__5_0_u_u(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -271,16 +271,15 @@ define void @v_shuffle_v4p3_v3p3__5_0_u_u(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
-; GFX942-NEXT: v_mov_b32_e32 v3, v0
-; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -414,27 +413,27 @@ define void @v_shuffle_v4p3_v3p3__5_3_u_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_3_u_u:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_3_u_u:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v4
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -554,16 +553,15 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_u(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -571,17 +569,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_u(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v1, v6
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -612,16 +609,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_u(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: v_mov_b32_e32 v4, v1
-; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -629,17 +626,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_u(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v2, v4
-; GFX942-NEXT: v_mov_b32_e32 v3, v4
-; GFX942-NEXT: v_mov_b32_e32 v4, v1
-; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -722,29 +719,27 @@ define void @v_shuffle_v4p3_v3p3__5_5_3_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_3_u:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_3_u:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v2
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v4
+; GFX942-NEXT: v_mov_b32_e32 v1, v4
+; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -771,29 +766,28 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_4_u:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_mov_b32_e32 v6, v1
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_4_u:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v2
-; GFX942-NEXT: v_mov_b32_e32 v6, v1
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v4
+; GFX942-NEXT: v_mov_b32_e32 v1, v4
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -1037,31 +1031,31 @@ define void @v_shuffle_v4p3_v3p3__5_5_5_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_5_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_mov_b32_e32 v2, v6
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_5_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v2
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v1, v6
+; GFX942-NEXT: v_mov_b32_e32 v2, v6
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -1089,31 +1083,28 @@ define void @v_shuffle_v4p3_v3p3__5_5_5_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_5_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v1
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_5_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v2
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v4
+; GFX942-NEXT: v_mov_b32_e32 v1, v4
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -1189,29 +1180,28 @@ define void @v_shuffle_v4p3_v3p3__u_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v3p3__u_0_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v0
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v3p3__u_0_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v1, v0
+; GFX942-NEXT: v_mov_b32_e32 v2, v0
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -1335,31 +1325,31 @@ define void @v_shuffle_v4p3_v3p3__2_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v3p3__2_0_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v3p3__2_0_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v1, v4
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -1385,29 +1375,28 @@ define void @v_shuffle_v4p3_v3p3__3_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v3p3__3_0_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v0
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v3p3__3_0_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v1, v0
+; GFX942-NEXT: v_mov_b32_e32 v2, v0
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -1499,15 +1488,15 @@ define void @v_shuffle_v4p3_v3p3__5_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v0
-; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1517,15 +1506,15 @@ define void @v_shuffle_v4p3_v3p3__5_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, v0
-; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
+; GFX942-NEXT: v_mov_b32_e32 v1, v4
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -1556,33 +1545,34 @@ define void @v_shuffle_v4p3_v3p3__5_u_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_u_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: global_store_dwordx4 v5, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_u_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: global_store_dwordx4 v5, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -1616,15 +1606,15 @@ define void @v_shuffle_v4p3_v3p3__5_1_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v1
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v0
-; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
+; GFX90A-NEXT: v_mov_b32_e32 v1, v5
+; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1634,15 +1624,15 @@ define void @v_shuffle_v4p3_v3p3__5_1_0_0(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, v0
-; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
+; GFX942-NEXT: v_mov_b32_e32 v1, v5
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -1674,17 +1664,17 @@ define void @v_shuffle_v4p3_v3p3__5_2_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v6
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
+; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1692,17 +1682,17 @@ define void @v_shuffle_v4p3_v3p3__5_2_0_0(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v4, v6
-; GFX942-NEXT: v_mov_b32_e32 v5, v2
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
+; GFX942-NEXT: v_mov_b32_e32 v1, v6
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -1734,17 +1724,17 @@ define void @v_shuffle_v4p3_v3p3__5_3_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v9, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[6:8]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v2
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v0
-; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v8
+; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v9, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1752,17 +1742,17 @@ define void @v_shuffle_v4p3_v3p3__5_3_0_0(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v9, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[6:8]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v7, v2
-; GFX942-NEXT: v_mov_b32_e32 v9, v0
-; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: v_mov_b32_e32 v0, v8
+; GFX942-NEXT: v_mov_b32_e32 v1, v6
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -1853,15 +1843,15 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v0
-; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1871,15 +1861,15 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_0(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v7, v4
-; GFX942-NEXT: v_mov_b32_e32 v9, v0
-; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -1911,16 +1901,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_u_0(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
-; GFX90A-NEXT: v_mov_b32_e32 v9, v0
-; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1928,16 +1918,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_u_0(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v9, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v7, v4
-; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v1, v6
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -2028,17 +2018,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_2_0(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v6
-; GFX90A-NEXT: v_mov_b32_e32 v5, v6
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, v6
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2046,17 +2036,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_2_0(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v4, v6
-; GFX942-NEXT: v_mov_b32_e32 v5, v6
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: v_mov_b32_e32 v2, v6
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -2088,17 +2078,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_3_0(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v9, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[6:8]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
-; GFX90A-NEXT: v_mov_b32_e32 v8, v2
-; GFX90A-NEXT: v_mov_b32_e32 v9, v0
-; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v8
+; GFX90A-NEXT: v_mov_b32_e32 v1, v8
+; GFX90A-NEXT: v_mov_b32_e32 v2, v6
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v9, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2106,17 +2096,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_3_0(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v9, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[6:8]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v9, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v7, v4
-; GFX942-NEXT: v_mov_b32_e32 v8, v2
-; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: v_mov_b32_e32 v0, v8
+; GFX942-NEXT: v_mov_b32_e32 v1, v8
+; GFX942-NEXT: v_mov_b32_e32 v2, v6
+; GFX942-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -2613,17 +2603,16 @@ define void @v_shuffle_v4p3_v3p3__5_0_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v1
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
-; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2631,17 +2620,16 @@ define void @v_shuffle_v4p3_v3p3__5_0_1_1(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v8, v1
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
-; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -2673,17 +2661,16 @@ define void @v_shuffle_v4p3_v3p3__5_2_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v6
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_mov_b32_e32 v6, v1
-; GFX90A-NEXT: v_mov_b32_e32 v7, v1
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2691,17 +2678,17 @@ define void @v_shuffle_v4p3_v3p3__5_2_1_1(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v4, v6
-; GFX942-NEXT: v_mov_b32_e32 v5, v2
-; GFX942-NEXT: v_mov_b32_e32 v6, v1
-; GFX942-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
+; GFX942-NEXT: v_mov_b32_e32 v1, v4
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -2733,17 +2720,16 @@ define void @v_shuffle_v4p3_v3p3__5_3_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v2
-; GFX90A-NEXT: v_mov_b32_e32 v8, v1
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
-; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2751,17 +2737,16 @@ define void @v_shuffle_v4p3_v3p3__5_3_1_1(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, v1
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v7, v2
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
-; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v1, v4
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -2849,17 +2834,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_1(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
-; GFX90A-NEXT: v_mov_b32_e32 v8, v1
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
-; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2867,17 +2851,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_1(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, v1
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v7, v4
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
-; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -2909,16 +2893,15 @@ define void @v_shuffle_v4p3_v3p3__5_5_u_1(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
-; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2926,16 +2909,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_u_1(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v7, v4
-; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -2966,17 +2949,15 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_1(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
-; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2984,17 +2965,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_1(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v7, v4
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
-; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v1, v6
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -3026,17 +3006,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_2_1(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v6
-; GFX90A-NEXT: v_mov_b32_e32 v5, v6
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v1
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -3044,17 +3023,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_2_1(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v4, v6
-; GFX942-NEXT: v_mov_b32_e32 v5, v6
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -3086,17 +3065,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_3_1(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
-; GFX90A-NEXT: v_mov_b32_e32 v8, v2
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
-; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -3104,17 +3082,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_3_1(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v7, v4
-; GFX942-NEXT: v_mov_b32_e32 v8, v2
-; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v1, v6
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -3146,17 +3124,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_1(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
-; GFX90A-NEXT: v_mov_b32_e32 v8, v3
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
-; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_mov_b32_e32 v2, v5
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -3164,17 +3141,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_1(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v7, v4
-; GFX942-NEXT: v_mov_b32_e32 v8, v3
-; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v1, v6
+; GFX942-NEXT: v_mov_b32_e32 v2, v5
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -3607,17 +3584,17 @@ define void @v_shuffle_v4p3_v3p3__5_0_2_2(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[6:8]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v6
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v2
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v8
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -3625,17 +3602,17 @@ define void @v_shuffle_v4p3_v3p3__5_0_2_2(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[6:8]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v4, v6
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v2
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: v_mov_b32_e32 v0, v8
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -3894,16 +3871,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_u_2(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v6
-; GFX90A-NEXT: v_mov_b32_e32 v5, v6
-; GFX90A-NEXT: v_mov_b32_e32 v7, v2
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -3911,16 +3888,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_u_2(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v4, v6
-; GFX942-NEXT: v_mov_b32_e32 v5, v6
-; GFX942-NEXT: v_mov_b32_e32 v7, v2
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v1, v6
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -3952,17 +3929,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_2(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[6:8]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v6
-; GFX90A-NEXT: v_mov_b32_e32 v5, v6
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: v_mov_b32_e32 v7, v2
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v8
+; GFX90A-NEXT: v_mov_b32_e32 v1, v8
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -3970,17 +3946,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_2(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[6:8]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v4, v6
-; GFX942-NEXT: v_mov_b32_e32 v5, v6
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: v_mov_b32_e32 v7, v2
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: v_mov_b32_e32 v0, v8
+; GFX942-NEXT: v_mov_b32_e32 v1, v8
+; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -4070,37 +4045,36 @@ define void @v_shuffle_v4p3_v3p3__5_5_3_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_3_2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:6]
+; GFX90A-NEXT: ; def v[6:8]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, v6
-; GFX90A-NEXT: v_mov_b32_e32 v9, v6
-; GFX90A-NEXT: v_mov_b32_e32 v10, v4
-; GFX90A-NEXT: v_mov_b32_e32 v11, v2
-; GFX90A-NEXT: global_store_dwordx4 v3, v[8:11], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v8
+; GFX90A-NEXT: v_mov_b32_e32 v1, v8
+; GFX90A-NEXT: v_mov_b32_e32 v2, v6
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_3_2:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:6]
+; GFX942-NEXT: ; def v[6:8]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v8, v6
-; GFX942-NEXT: v_mov_b32_e32 v9, v6
-; GFX942-NEXT: v_mov_b32_e32 v10, v4
-; GFX942-NEXT: v_mov_b32_e32 v11, v2
-; GFX942-NEXT: global_store_dwordx4 v3, v[8:11], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: v_mov_b32_e32 v0, v8
+; GFX942-NEXT: v_mov_b32_e32 v1, v8
+; GFX942-NEXT: v_mov_b32_e32 v2, v6
+; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -4384,31 +4358,31 @@ define void @v_shuffle_v4p3_v3p3__5_3_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_3_3_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_3_3_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v1, v4
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -4436,27 +4410,29 @@ define void @v_shuffle_v4p3_v3p3__5_u_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_u_3_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v1, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v0
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: global_store_dwordx4 v3, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v1, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_u_3_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v1, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v0
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: global_store_dwordx4 v3, v[2:5], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -4487,17 +4463,17 @@ define void @v_shuffle_v4p3_v3p3__5_0_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v2
-; GFX90A-NEXT: v_mov_b32_e32 v9, v2
-; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -4505,17 +4481,17 @@ define void @v_shuffle_v4p3_v3p3__5_0_3_3(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v8, v2
-; GFX942-NEXT: v_mov_b32_e32 v9, v2
-; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -4604,35 +4580,36 @@ define void @v_shuffle_v4p3_v3p3__5_2_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_2_3_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, v2
-; GFX90A-NEXT: v_mov_b32_e32 v8, v4
-; GFX90A-NEXT: v_mov_b32_e32 v9, v4
-; GFX90A-NEXT: global_store_dwordx4 v3, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_2_3_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v7, v2
-; GFX942-NEXT: v_mov_b32_e32 v8, v4
-; GFX942-NEXT: v_mov_b32_e32 v9, v4
-; GFX942-NEXT: global_store_dwordx4 v3, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -4660,31 +4637,31 @@ define void @v_shuffle_v4p3_v3p3__5_4_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_4_3_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v5
+; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_4_3_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v1
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v1, v5
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -4712,31 +4689,31 @@ define void @v_shuffle_v4p3_v3p3__5_5_3_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_3_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_3_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v2
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v1, v6
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -4764,29 +4741,28 @@ define void @v_shuffle_v4p3_v3p3__5_5_u_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_u_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_u_3:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v4
+; GFX942-NEXT: v_mov_b32_e32 v1, v4
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
+; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -4817,17 +4793,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_3(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v2
-; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -4835,17 +4810,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_3(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v7, v4
-; GFX942-NEXT: v_mov_b32_e32 v9, v2
-; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v1, v6
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -5401,27 +5376,28 @@ define void @v_shuffle_v4p3_v3p3__5_u_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_u_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v1
-; GFX90A-NEXT: v_mov_b32_e32 v5, v1
-; GFX90A-NEXT: global_store_dwordx4 v3, v[2:5], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, v1
+; GFX90A-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_u_4_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v1
-; GFX942-NEXT: v_mov_b32_e32 v5, v1
-; GFX942-NEXT: global_store_dwordx4 v3, v[2:5], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
+; GFX942-NEXT: v_mov_b32_e32 v2, v1
+; GFX942-NEXT: v_mov_b32_e32 v3, v1
+; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -5452,17 +5428,17 @@ define void @v_shuffle_v4p3_v3p3__5_0_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v3
-; GFX90A-NEXT: v_mov_b32_e32 v9, v3
-; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, v5
+; GFX90A-NEXT: v_mov_b32_e32 v3, v5
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -5470,17 +5446,17 @@ define void @v_shuffle_v4p3_v3p3__5_0_4_4(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v8, v3
-; GFX942-NEXT: v_mov_b32_e32 v9, v3
-; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v2, v5
+; GFX942-NEXT: v_mov_b32_e32 v3, v5
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -5567,35 +5543,36 @@ define void @v_shuffle_v4p3_v3p3__5_2_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_2_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v7, v2
-; GFX90A-NEXT: v_mov_b32_e32 v8, v5
-; GFX90A-NEXT: v_mov_b32_e32 v9, v5
-; GFX90A-NEXT: global_store_dwordx4 v3, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, v5
+; GFX90A-NEXT: v_mov_b32_e32 v3, v5
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_2_4_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v7, v2
-; GFX942-NEXT: v_mov_b32_e32 v8, v5
-; GFX942-NEXT: v_mov_b32_e32 v9, v5
-; GFX942-NEXT: global_store_dwordx4 v3, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v2, v5
+; GFX942-NEXT: v_mov_b32_e32 v3, v5
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -5622,31 +5599,28 @@ define void @v_shuffle_v4p3_v3p3__5_3_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_3_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v1
-; GFX90A-NEXT: v_mov_b32_e32 v7, v1
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_3_4_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v1
-; GFX942-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v4
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -5674,31 +5648,28 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_mov_b32_e32 v6, v1
-; GFX90A-NEXT: v_mov_b32_e32 v7, v1
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_4_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v2
-; GFX942-NEXT: v_mov_b32_e32 v6, v1
-; GFX942-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v4
+; GFX942-NEXT: v_mov_b32_e32 v1, v4
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -5726,29 +5697,27 @@ define void @v_shuffle_v4p3_v3p3__5_5_u_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_u_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v1
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_u_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v4
+; GFX942-NEXT: v_mov_b32_e32 v1, v4
+; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -5780,17 +5749,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_4(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v3
-; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_mov_b32_e32 v3, v5
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -5798,17 +5766,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_4(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v7, v4
-; GFX942-NEXT: v_mov_b32_e32 v9, v3
-; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v1, v6
+; GFX942-NEXT: v_mov_b32_e32 v3, v5
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -5840,17 +5808,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_4(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
-; GFX90A-NEXT: v_mov_b32_e32 v8, v1
-; GFX90A-NEXT: v_mov_b32_e32 v9, v3
-; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v5
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -5858,17 +5826,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_4(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, v1
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v7, v4
-; GFX942-NEXT: v_mov_b32_e32 v9, v3
-; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v1, v6
+; GFX942-NEXT: v_mov_b32_e32 v3, v5
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -5954,31 +5922,27 @@ define void @v_shuffle_v4p3_v3p3__5_5_3_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_3_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: v_mov_b32_e32 v7, v1
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_3_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v2
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, v4
+; GFX942-NEXT: v_mov_b32_e32 v1, v4
+; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -6544,31 +6508,30 @@ define void @v_shuffle_v4p3_v3p3__5_3_5_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_3_5_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v0
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v2
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_3_5_5:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v2
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v4
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -6641,29 +6604,28 @@ define void @v_shuffle_v4p3_v3p3__5_5_u_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_u_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v2
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_u_5:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:2]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v2
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
+; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -6695,17 +6657,16 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_5(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:4]
+; GFX90A-NEXT: ; def v[4:6]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v4
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v4
-; GFX90A-NEXT: global_store_dwordx4 v5, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: v_mov_b32_e32 v3, v6
+; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -6713,17 +6674,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_5(ptr addrspace(1) inreg %ptr) {
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[2:4]
+; GFX942-NEXT: ; def v[4:6]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v6, v4
-; GFX942-NEXT: v_mov_b32_e32 v7, v4
-; GFX942-NEXT: v_mov_b32_e32 v9, v4
-; GFX942-NEXT: global_store_dwordx4 v5, v[6:9], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v1, v6
+; GFX942-NEXT: v_mov_b32_e32 v3, v6
+; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -6869,31 +6830,28 @@ define void @v_shuffle_v4p3_v3p3__5_5_3_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_3_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:2]
+; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, v2
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_mov_b32_e32 v6, v0
-; GFX90A-NEXT: v_mov_b32_e32 v7, v2
-; GFX90A-NEXT: global_store_dwordx4 v3, v[4:7], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_3_5:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:2]
+; GFX942-NEXT: ; def v[2:4]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: v_mov_b32_e32 v5, v2
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: v_mov_b32_e32 v7, v2
-; GFX942-NEXT: global_store_dwordx4 v3, v[4:7], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v4
+; GFX942-NEXT: v_mov_b32_e32 v1, v4
+; GFX942-NEXT: v_mov_b32_e32 v3, v4
+; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v4p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v4p3.ll
index 1684b94cfd452..eeab42ae40d7f 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v4p3.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v4p3.ll
@@ -963,29 +963,26 @@ define void @v_shuffle_v4p3_v4p3__7_7_4_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_4_u:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_4_u:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -1012,29 +1009,26 @@ define void @v_shuffle_v4p3_v4p3__7_7_5_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_5_u:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v1
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v4, v1
+; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_5_u:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_mov_b32_e32 v8, v1
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: v_mov_b32_e32 v4, v1
+; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -1446,31 +1440,31 @@ define void @v_shuffle_v4p3_v4p3__7_7_7_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_7_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v3
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_7_5:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mov_b32_e32 v4, v3
+; GFX942-NEXT: v_mov_b32_e32 v5, v3
; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_mov_b32_e32 v8, v3
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v7, v1
+; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -1740,31 +1734,28 @@ define void @v_shuffle_v4p3_v4p3__2_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v4p3__2_0_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v0
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v4p3__2_0_0_0:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, v0
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -2794,15 +2785,14 @@ define void @v_shuffle_v4p3_v4p3__7_7_4_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, v5
-; GFX90A-NEXT: v_mov_b32_e32 v9, v5
-; GFX90A-NEXT: v_mov_b32_e32 v10, v2
-; GFX90A-NEXT: v_mov_b32_e32 v11, v0
-; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v5
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -2812,15 +2802,14 @@ define void @v_shuffle_v4p3_v4p3__7_7_4_0(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v11, v0
-; GFX942-NEXT: v_mov_b32_e32 v8, v5
-; GFX942-NEXT: v_mov_b32_e32 v9, v5
-; GFX942-NEXT: v_mov_b32_e32 v10, v2
-; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, v5
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -4149,15 +4138,14 @@ define void @v_shuffle_v4p3_v4p3__7_7_4_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, v5
-; GFX90A-NEXT: v_mov_b32_e32 v9, v5
-; GFX90A-NEXT: v_mov_b32_e32 v10, v2
-; GFX90A-NEXT: v_mov_b32_e32 v11, v1
-; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v5
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -4167,15 +4155,14 @@ define void @v_shuffle_v4p3_v4p3__7_7_4_1(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v11, v1
-; GFX942-NEXT: v_mov_b32_e32 v8, v5
-; GFX942-NEXT: v_mov_b32_e32 v9, v5
-; GFX942-NEXT: v_mov_b32_e32 v10, v2
-; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v7, v1
+; GFX942-NEXT: v_mov_b32_e32 v4, v5
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -4209,15 +4196,14 @@ define void @v_shuffle_v4p3_v4p3__7_7_5_1(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, v5
-; GFX90A-NEXT: v_mov_b32_e32 v9, v5
-; GFX90A-NEXT: v_mov_b32_e32 v10, v3
-; GFX90A-NEXT: v_mov_b32_e32 v11, v1
-; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v5
+; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -4227,15 +4213,14 @@ define void @v_shuffle_v4p3_v4p3__7_7_5_1(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v11, v1
-; GFX942-NEXT: v_mov_b32_e32 v8, v5
-; GFX942-NEXT: v_mov_b32_e32 v9, v5
-; GFX942-NEXT: v_mov_b32_e32 v10, v3
-; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v7, v1
+; GFX942-NEXT: v_mov_b32_e32 v4, v5
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -5463,37 +5448,34 @@ define void @v_shuffle_v4p3_v4p3__7_7_4_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_4_2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:3]
+; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v10, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[4:7]
+; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v10, v7
-; GFX90A-NEXT: v_mov_b32_e32 v11, v7
-; GFX90A-NEXT: v_mov_b32_e32 v12, v4
-; GFX90A-NEXT: v_mov_b32_e32 v13, v2
-; GFX90A-NEXT: global_store_dwordx4 v8, v[10:13], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v7
+; GFX90A-NEXT: v_mov_b32_e32 v8, v4
+; GFX90A-NEXT: v_mov_b32_e32 v9, v2
+; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_4_2:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[0:3]
+; GFX942-NEXT: ; def v[4:7]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: v_mov_b32_e32 v10, 0
; GFX942-NEXT: ;;#ASMSTART
-; GFX942-NEXT: ; def v[4:7]
+; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v10, v7
-; GFX942-NEXT: v_mov_b32_e32 v11, v7
-; GFX942-NEXT: v_mov_b32_e32 v12, v4
-; GFX942-NEXT: v_mov_b32_e32 v13, v2
-; GFX942-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, v7
+; GFX942-NEXT: v_mov_b32_e32 v8, v4
+; GFX942-NEXT: v_mov_b32_e32 v9, v2
+; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -7190,31 +7172,28 @@ define void @v_shuffle_v4p3_v4p3__6_4_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v4p3__6_4_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v2
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v0
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v4p3__6_4_4_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v2
-; GFX942-NEXT: v_mov_b32_e32 v7, v0
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, v0
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -7292,29 +7271,28 @@ define void @v_shuffle_v4p3_v4p3__7_u_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_u_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v0
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_u_4_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, v0
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -7581,31 +7559,30 @@ define void @v_shuffle_v4p3_v4p3__7_5_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_5_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v1
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v0
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_5_4_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, v0
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: v_mov_b32_e32 v3, v1
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -7683,31 +7660,28 @@ define void @v_shuffle_v4p3_v4p3__7_7_4_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v0
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_4_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, v0
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -7789,15 +7763,14 @@ define void @v_shuffle_v4p3_v4p3__7_7_0_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, v5
-; GFX90A-NEXT: v_mov_b32_e32 v9, v5
-; GFX90A-NEXT: v_mov_b32_e32 v10, v0
-; GFX90A-NEXT: v_mov_b32_e32 v11, v2
-; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v5
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -7807,15 +7780,14 @@ define void @v_shuffle_v4p3_v4p3__7_7_0_4(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v10, v0
-; GFX942-NEXT: v_mov_b32_e32 v8, v5
-; GFX942-NEXT: v_mov_b32_e32 v9, v5
-; GFX942-NEXT: v_mov_b32_e32 v11, v2
-; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, v5
+; GFX942-NEXT: v_mov_b32_e32 v7, v2
+; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -8069,31 +8041,31 @@ define void @v_shuffle_v4p3_v4p3__7_7_6_4(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_6_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v2
-; GFX90A-NEXT: v_mov_b32_e32 v9, v0
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_6_4:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_mov_b32_e32 v8, v2
-; GFX942-NEXT: v_mov_b32_e32 v9, v0
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v3
+; GFX942-NEXT: v_mov_b32_e32 v5, v3
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -8974,31 +8946,28 @@ define void @v_shuffle_v4p3_v4p3__7_7_5_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_5_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v1
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v4, v1
+; GFX90A-NEXT: v_mov_b32_e32 v5, v1
+; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_5_5:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_mov_b32_e32 v8, v1
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: v_mov_b32_e32 v4, v1
+; GFX942-NEXT: v_mov_b32_e32 v5, v1
+; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -9078,15 +9047,14 @@ define void @v_shuffle_v4p3_v4p3__7_7_0_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, v5
-; GFX90A-NEXT: v_mov_b32_e32 v9, v5
-; GFX90A-NEXT: v_mov_b32_e32 v10, v0
-; GFX90A-NEXT: v_mov_b32_e32 v11, v3
-; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v5
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -9096,15 +9064,14 @@ define void @v_shuffle_v4p3_v4p3__7_7_0_5(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v10, v0
-; GFX942-NEXT: v_mov_b32_e32 v8, v5
-; GFX942-NEXT: v_mov_b32_e32 v9, v5
-; GFX942-NEXT: v_mov_b32_e32 v11, v3
-; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, v5
+; GFX942-NEXT: v_mov_b32_e32 v7, v3
+; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -9138,15 +9105,14 @@ define void @v_shuffle_v4p3_v4p3__7_7_1_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, v5
-; GFX90A-NEXT: v_mov_b32_e32 v9, v5
-; GFX90A-NEXT: v_mov_b32_e32 v10, v1
-; GFX90A-NEXT: v_mov_b32_e32 v11, v3
-; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v5
+; GFX90A-NEXT: v_mov_b32_e32 v6, v1
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -9156,15 +9122,14 @@ define void @v_shuffle_v4p3_v4p3__7_7_1_5(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v10, v1
-; GFX942-NEXT: v_mov_b32_e32 v8, v5
-; GFX942-NEXT: v_mov_b32_e32 v9, v5
-; GFX942-NEXT: v_mov_b32_e32 v11, v3
-; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, v1
+; GFX942-NEXT: v_mov_b32_e32 v4, v5
+; GFX942-NEXT: v_mov_b32_e32 v7, v3
+; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -9312,31 +9277,28 @@ define void @v_shuffle_v4p3_v4p3__7_7_4_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_4_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: v_mov_b32_e32 v5, v1
+; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_4_5:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: v_mov_b32_e32 v5, v1
+; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -9365,31 +9327,31 @@ define void @v_shuffle_v4p3_v4p3__7_7_6_5(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_6_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v2
-; GFX90A-NEXT: v_mov_b32_e32 v9, v1
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_6_5:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_mov_b32_e32 v8, v2
-; GFX942-NEXT: v_mov_b32_e32 v9, v1
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v3
+; GFX942-NEXT: v_mov_b32_e32 v5, v3
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v1
+; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -10357,15 +10319,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_0_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v10, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v8, v5
-; GFX90A-NEXT: v_mov_b32_e32 v9, v5
-; GFX90A-NEXT: v_mov_b32_e32 v10, v0
-; GFX90A-NEXT: v_mov_b32_e32 v11, v4
-; GFX90A-NEXT: global_store_dwordx4 v6, v[8:11], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v6, v5
+; GFX90A-NEXT: v_mov_b32_e32 v7, v5
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v4
+; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -10375,15 +10337,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_0_6(ptr addrspace(1) inreg %ptr) {
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v10, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:5]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v10, v0
-; GFX942-NEXT: v_mov_b32_e32 v8, v5
-; GFX942-NEXT: v_mov_b32_e32 v9, v5
-; GFX942-NEXT: v_mov_b32_e32 v11, v4
-; GFX942-NEXT: global_store_dwordx4 v6, v[8:11], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v5
+; GFX942-NEXT: v_mov_b32_e32 v7, v5
+; GFX942-NEXT: v_mov_b32_e32 v9, v4
+; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -10591,31 +10553,31 @@ define void @v_shuffle_v4p3_v4p3__7_7_4_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_4_6:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v2
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_4_6:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
-; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, v2
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v4, v3
+; GFX942-NEXT: v_mov_b32_e32 v5, v3
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v7, v2
+; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -11815,31 +11777,31 @@ define void @v_shuffle_v4p3_v4p3__7_7_4_7(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_4_7:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v9, v3
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_4_7:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v4, v3
+; GFX942-NEXT: v_mov_b32_e32 v5, v3
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_mov_b32_e32 v8, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, v3
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
@@ -11868,31 +11830,31 @@ define void @v_shuffle_v4p3_v4p3__7_7_5_7(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_5_7:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v4, v3
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_mov_b32_e32 v6, v1
; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_mov_b32_e32 v8, v1
-; GFX90A-NEXT: v_mov_b32_e32 v9, v3
-; GFX90A-NEXT: global_store_dwordx4 v4, v[6:9], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_5_7:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:3]
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v4, v3
+; GFX942-NEXT: v_mov_b32_e32 v5, v3
+; GFX942-NEXT: v_mov_b32_e32 v6, v1
; GFX942-NEXT: v_mov_b32_e32 v7, v3
-; GFX942-NEXT: v_mov_b32_e32 v8, v1
-; GFX942-NEXT: v_mov_b32_e32 v9, v3
-; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll b/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll
index 96b18593ea655..d2008be4fd32a 100644
--- a/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll
+++ b/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll
@@ -16,18 +16,19 @@ define amdgpu_kernel void @foo(ptr addrspace(5) %ptr5, ptr %p0, double %v0, <4 x
; CHECK-NEXT: s_add_u32 s0, s0, s17
; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: s_mov_b64 s[8:9], src_private_base
-; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7]
+; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_cmp_lg_u32 s68, -1
-; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5]
; CHECK-NEXT: s_mov_b32 s4, 0
; CHECK-NEXT: s_cselect_b32 s5, s9, 0
+; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7]
; CHECK-NEXT: s_cselect_b32 s6, s68, 0
-; CHECK-NEXT: s_add_u32 s50, s34, 48
; CHECK-NEXT: v_mov_b32_e32 v57, s5
; CHECK-NEXT: s_mov_b32 s5, s4
+; CHECK-NEXT: s_add_u32 s50, s34, 48
+; CHECK-NEXT: v_accvgpr_write_b32 a33, s5
; CHECK-NEXT: s_addc_u32 s51, s35, 0
-; CHECK-NEXT: v_pk_mov_b32 v[62:63], s[4:5], s[4:5] op_sel:[0,1]
+; CHECK-NEXT: v_accvgpr_write_b32 a32, s4
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, G@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, G@gotpcrel32@hi+12
@@ -47,13 +48,13 @@ define amdgpu_kernel void @foo(ptr addrspace(5) %ptr5, ptr %p0, double %v0, <4 x
; CHECK-NEXT: s_mov_b32 s52, s15
; CHECK-NEXT: s_mov_b64 s[36:37], s[10:11]
; CHECK-NEXT: v_mov_b32_e32 v40, v0
-; CHECK-NEXT: v_mov_b32_e32 v60, s66
-; CHECK-NEXT: v_mov_b32_e32 v61, s67
-; CHECK-NEXT: flat_store_dwordx2 v[58:59], v[62:63]
+; CHECK-NEXT: v_mov_b32_e32 v62, s66
+; CHECK-NEXT: v_mov_b32_e32 v63, s67
+; CHECK-NEXT: flat_store_dwordx2 v[58:59], a[32:33]
; CHECK-NEXT: ; kill: def $sgpr15 killed $sgpr15
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[54:55]
-; CHECK-NEXT: flat_load_dwordx2 a[32:33], v[58:59]
+; CHECK-NEXT: flat_load_dwordx2 v[60:61], v[58:59]
; CHECK-NEXT: v_mov_b32_e32 v44, 0
; CHECK-NEXT: v_mov_b32_e32 v45, 0x3ff00000
; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49]
@@ -65,7 +66,7 @@ define amdgpu_kernel void @foo(ptr addrspace(5) %ptr5, ptr %p0, double %v0, <4 x
; CHECK-NEXT: s_mov_b32 s14, s33
; CHECK-NEXT: v_mov_b32_e32 v31, v40
; CHECK-NEXT: flat_store_dwordx2 v[46:47], v[44:45]
-; CHECK-NEXT: flat_store_dwordx2 v[58:59], v[62:63]
+; CHECK-NEXT: flat_store_dwordx2 v[58:59], a[32:33]
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: ; kill: def $sgpr15 killed $sgpr15
; CHECK-NEXT: s_swappc_b64 s[30:31], s[54:55]
@@ -74,9 +75,9 @@ define amdgpu_kernel void @foo(ptr addrspace(5) %ptr5, ptr %p0, double %v0, <4 x
; CHECK-NEXT: v_mov_b32_e32 v1, s67
; CHECK-NEXT: v_mov_b32_e32 v0, s68
; CHECK-NEXT: v_cmp_lt_i32_e32 vcc, 0, v42
-; CHECK-NEXT: flat_store_dwordx2 v[58:59], a[32:33]
-; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_dwordx2 v[58:59], v[60:61]
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_dwordx2 v[58:59], v[62:63]
; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT: buffer_store_dword v44, v0, s[0:3], 0 offen
; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc
diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
index 644705e173b52..b045c761436de 100644
--- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
+++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
@@ -617,30 +617,30 @@ define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspac
; GFX942-LABEL: v8i8_multi_block:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX942-NEXT: v_and_b32_e32 v1, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v2, 3, v1
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v1
+; GFX942-NEXT: v_and_b32_e32 v3, 0x3ff, v0
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v3
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v3
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx2 v[6:7], v2, s[8:9]
+; GFX942-NEXT: global_load_dwordx2 v[0:1], v4, s[8:9]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
+; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX942-NEXT: s_cbranch_execz .LBB11_4
; GFX942-NEXT: ; %bb.1: ; %bb.1
-; GFX942-NEXT: global_load_dwordx2 v[4:5], v2, s[10:11]
-; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v1
+; GFX942-NEXT: global_load_dwordx2 v[6:7], v4, s[10:11]
+; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v3
; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX942-NEXT: s_cbranch_execz .LBB11_3
; GFX942-NEXT: ; %bb.2: ; %bb.2
-; GFX942-NEXT: v_mov_b32_e32 v1, 0
-; GFX942-NEXT: global_store_dwordx2 v1, v[6:7], s[12:13]
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[12:13]
; GFX942-NEXT: .LBB11_3: ; %Flow
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX942-NEXT: .LBB11_4: ; %bb.3
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v0, v[4:5], s[14:15]
+; GFX942-NEXT: global_store_dwordx2 v2, v[6:7], s[14:15]
; GFX942-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
>From 4dfc7ab072a2be65797d01e4fef7ced42ba96e5d Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Wed, 8 Oct 2025 15:59:23 -0700
Subject: [PATCH 4/4] Control with flag
Change-Id: I45128d10724a59687edda05a6fcd37302bfe7e6d
---
.../Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp | 11 +-
llvm/test/CodeGen/AMDGPU/preinflate-avgpr.ll | 499 +++++++++++++++++-
2 files changed, 508 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp
index 427922481ecca..b7dbee9c32130 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp
@@ -27,6 +27,12 @@ using namespace llvm;
#define DEBUG_TYPE "amdgpu-prepare-agpr-alloc"
+static cl::opt<bool> InflateToAVClass(
+ "amdgpu-avgpr-inflation", cl::Hidden,
+ cl::desc("Whether to inflate register to the avgpr register "
+ "class -- which is assignable to either vgpr or agpr."),
+ cl::init(false));
+
namespace {
class AMDGPUPrepareAGPRAllocImpl {
@@ -122,6 +128,9 @@ bool AMDGPUPrepareAGPRAllocImpl::run(MachineFunction &MF) {
continue;
}
+ if (!InflateToAVClass)
+ continue;
+
for (MachineOperand &Op : MI.operands()) {
if (!Op.isReg() || !Op.isDef())
continue;
@@ -132,7 +141,7 @@ bool AMDGPUPrepareAGPRAllocImpl::run(MachineFunction &MF) {
const TargetRegisterClass *RC = MRI.getRegClass(DefReg);
- if (TRI->isAGPRClass(RC) || TRI->isVGPRClass(RC))
+ if (TRI->hasVectorRegisters(RC))
Changed |= MRI.recomputeRegClass(DefReg);
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/preinflate-avgpr.ll b/llvm/test/CodeGen/AMDGPU/preinflate-avgpr.ll
index 3a534149121fb..bf4bf25e6b02a 100644
--- a/llvm/test/CodeGen/AMDGPU/preinflate-avgpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/preinflate-avgpr.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 --amdgpu-mfma-vgpr-form=1 --greedy-regclass-priority-trumps-globalness=1 < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 --amdgpu-mfma-vgpr-form=1 --greedy-regclass-priority-trumps-globalness=1 --amdgpu-avgpr-inflation < %s | FileCheck -check-prefixes=INFLATE %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 --amdgpu-mfma-vgpr-form=1 --greedy-regclass-priority-trumps-globalness=1 < %s | FileCheck -check-prefixes=GCN %s
define amdgpu_kernel void @bad_rp(ptr addrspace(3) %in0, ptr addrspace(0) %out, i1 %cond) #0 {
; CHECK-LABEL: bad_rp:
@@ -119,6 +120,331 @@ define amdgpu_kernel void @bad_rp(ptr addrspace(3) %in0, ptr addrspace(0) %out,
; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[4:7] offset:496
; CHECK-NEXT: flat_store_dwordx4 v[168:169], v[0:3] offset:480
; CHECK-NEXT: s_endpgm
+; INFLATE-LABEL: bad_rp:
+; INFLATE: ; %bb.0:
+; INFLATE-NEXT: s_load_dword s0, s[4:5], 0x0
+; INFLATE-NEXT: s_load_dword s1, s[4:5], 0x10
+; INFLATE-NEXT: s_waitcnt lgkmcnt(0)
+; INFLATE-NEXT: v_mov_b32_e32 v0, s0
+; INFLATE-NEXT: ds_read_b128 a[0:3], v0
+; INFLATE-NEXT: ds_read_b128 a[4:7], v0 offset:16
+; INFLATE-NEXT: ds_read_b128 a[8:11], v0 offset:32
+; INFLATE-NEXT: ds_read_b128 a[12:15], v0 offset:48
+; INFLATE-NEXT: ds_read_b128 a[16:19], v0 offset:64
+; INFLATE-NEXT: ds_read_b128 a[20:23], v0 offset:80
+; INFLATE-NEXT: ds_read_b128 a[24:27], v0 offset:96
+; INFLATE-NEXT: ds_read_b128 a[28:31], v0 offset:112
+; INFLATE-NEXT: ds_read_b128 a[32:35], v0 offset:128
+; INFLATE-NEXT: ds_read_b128 a[36:39], v0 offset:144
+; INFLATE-NEXT: ds_read_b128 a[40:43], v0 offset:160
+; INFLATE-NEXT: ds_read_b128 a[44:47], v0 offset:176
+; INFLATE-NEXT: ds_read_b128 a[48:51], v0 offset:192
+; INFLATE-NEXT: ds_read_b128 a[52:55], v0 offset:208
+; INFLATE-NEXT: ds_read_b128 a[56:59], v0 offset:224
+; INFLATE-NEXT: ds_read_b128 a[60:63], v0 offset:240
+; INFLATE-NEXT: s_bitcmp1_b32 s1, 0
+; INFLATE-NEXT: s_cselect_b64 s[0:1], -1, 0
+; INFLATE-NEXT: s_xor_b64 s[0:1], s[0:1], -1
+; INFLATE-NEXT: .LBB0_1: ; %bb.1
+; INFLATE-NEXT: ; =>This Inner Loop Header: Depth=1
+; INFLATE-NEXT: s_waitcnt lgkmcnt(14)
+; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[240:255], a[0:3], a[0:3], 0
+; INFLATE-NEXT: s_andn2_b64 vcc, exec, s[0:1]
+; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[224:239], a[4:7], a[4:7], v[240:255]
+; INFLATE-NEXT: s_waitcnt lgkmcnt(13)
+; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[208:223], a[8:11], a[8:11], v[224:239]
+; INFLATE-NEXT: s_waitcnt lgkmcnt(12)
+; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[192:207], a[12:15], a[12:15], v[208:223]
+; INFLATE-NEXT: s_waitcnt lgkmcnt(11)
+; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[176:191], a[16:19], a[16:19], v[192:207]
+; INFLATE-NEXT: s_waitcnt lgkmcnt(10)
+; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[160:175], a[20:23], a[20:23], v[176:191]
+; INFLATE-NEXT: s_waitcnt lgkmcnt(9)
+; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[144:159], a[24:27], a[24:27], v[160:175]
+; INFLATE-NEXT: s_waitcnt lgkmcnt(8)
+; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[128:143], a[28:31], a[28:31], v[144:159]
+; INFLATE-NEXT: s_waitcnt lgkmcnt(7)
+; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[112:127], a[32:35], a[32:35], v[128:143]
+; INFLATE-NEXT: s_waitcnt lgkmcnt(6)
+; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[96:111], a[36:39], a[36:39], v[112:127]
+; INFLATE-NEXT: s_waitcnt lgkmcnt(5)
+; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[80:95], a[40:43], a[40:43], v[96:111]
+; INFLATE-NEXT: s_waitcnt lgkmcnt(4)
+; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[64:79], a[44:47], a[44:47], v[80:95]
+; INFLATE-NEXT: s_waitcnt lgkmcnt(3)
+; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[48:63], a[48:51], a[48:51], v[64:79]
+; INFLATE-NEXT: s_waitcnt lgkmcnt(2)
+; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[32:47], a[52:55], a[52:55], v[48:63]
+; INFLATE-NEXT: s_waitcnt lgkmcnt(1)
+; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], a[56:59], a[56:59], v[32:47]
+; INFLATE-NEXT: s_waitcnt lgkmcnt(0)
+; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], a[60:63], a[60:63], v[16:31]
+; INFLATE-NEXT: s_cbranch_vccnz .LBB0_1
+; INFLATE-NEXT: ; %bb.2: ; %bb.2
+; INFLATE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; INFLATE-NEXT: s_waitcnt lgkmcnt(0)
+; INFLATE-NEXT: v_mov_b64_e32 v[168:169], s[0:1]
+; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[244:247] offset:16
+; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[240:243]
+; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[224:227] offset:32
+; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[228:231] offset:48
+; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[208:211] offset:64
+; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[212:215] offset:80
+; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[192:195] offset:96
+; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[196:199] offset:112
+; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[180:183] offset:144
+; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[176:179] offset:128
+; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[160:163] offset:160
+; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[164:167] offset:176
+; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[148:151] offset:208
+; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[156:159] offset:240
+; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[152:155] offset:224
+; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[144:147] offset:192
+; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[140:143] offset:272
+; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[136:139] offset:256
+; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[132:135] offset:240
+; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[128:131] offset:224
+; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[124:127] offset:304
+; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[120:123] offset:288
+; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[116:119] offset:272
+; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[112:115] offset:256
+; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[108:111] offset:336
+; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[104:107] offset:320
+; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[100:103] offset:304
+; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[96:99] offset:288
+; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[92:95] offset:368
+; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[88:91] offset:352
+; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[84:87] offset:336
+; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[80:83] offset:320
+; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[76:79] offset:400
+; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[72:75] offset:384
+; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[68:71] offset:368
+; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[64:67] offset:352
+; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[60:63] offset:432
+; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[56:59] offset:416
+; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[52:55] offset:400
+; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[48:51] offset:384
+; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[44:47] offset:464
+; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[40:43] offset:448
+; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[36:39] offset:432
+; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[32:35] offset:416
+; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[28:31] offset:496
+; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[24:27] offset:480
+; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[20:23] offset:464
+; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[16:19] offset:448
+; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[12:15] offset:528
+; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[8:11] offset:512
+; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[4:7] offset:496
+; INFLATE-NEXT: flat_store_dwordx4 v[168:169], v[0:3] offset:480
+; INFLATE-NEXT: s_endpgm
+;
+; GCN-LABEL: bad_rp:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s0, s[4:5], 0x0
+; GCN-NEXT: s_load_dword s1, s[4:5], 0x10
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v12, s0
+; GCN-NEXT: ds_read_b128 a[48:51], v12
+; GCN-NEXT: ds_read_b128 a[52:55], v12 offset:16
+; GCN-NEXT: ds_read_b128 a[56:59], v12 offset:32
+; GCN-NEXT: ds_read_b128 a[60:63], v12 offset:48
+; GCN-NEXT: ds_read_b128 a[64:67], v12 offset:64
+; GCN-NEXT: ds_read_b128 a[68:71], v12 offset:80
+; GCN-NEXT: ds_read_b128 a[72:75], v12 offset:96
+; GCN-NEXT: ds_read_b128 a[76:79], v12 offset:112
+; GCN-NEXT: ds_read_b128 v[0:3], v12 offset:128
+; GCN-NEXT: ds_read_b128 v[4:7], v12 offset:144
+; GCN-NEXT: ds_read_b128 v[8:11], v12 offset:160
+; GCN-NEXT: ds_read_b128 v[32:35], v12 offset:176
+; GCN-NEXT: ds_read_b128 v[36:39], v12 offset:192
+; GCN-NEXT: ds_read_b128 v[40:43], v12 offset:208
+; GCN-NEXT: ds_read_b128 v[44:47], v12 offset:224
+; GCN-NEXT: ds_read_b128 v[12:15], v12 offset:240
+; GCN-NEXT: s_bitcmp1_b32 s1, 0
+; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], -1
+; GCN-NEXT: .LBB0_1: ; %bb.1
+; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN-NEXT: s_waitcnt lgkmcnt(14)
+; GCN-NEXT: s_nop 9
+; GCN-NEXT: v_accvgpr_read_b32 v16, a48
+; GCN-NEXT: v_accvgpr_read_b32 v17, a49
+; GCN-NEXT: v_accvgpr_read_b32 v18, a50
+; GCN-NEXT: v_accvgpr_read_b32 v19, a51
+; GCN-NEXT: s_andn2_b64 vcc, exec, s[0:1]
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[240:255], v[16:19], v[16:19], 0
+; GCN-NEXT: v_accvgpr_read_b32 v16, a52
+; GCN-NEXT: v_accvgpr_read_b32 v17, a53
+; GCN-NEXT: v_accvgpr_read_b32 v18, a54
+; GCN-NEXT: v_accvgpr_read_b32 v19, a55
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[224:239], v[16:19], v[16:19], v[240:255]
+; GCN-NEXT: s_waitcnt lgkmcnt(13)
+; GCN-NEXT: v_accvgpr_read_b32 v16, a56
+; GCN-NEXT: v_accvgpr_read_b32 v17, a57
+; GCN-NEXT: v_accvgpr_read_b32 v18, a58
+; GCN-NEXT: v_accvgpr_read_b32 v19, a59
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[208:223], v[16:19], v[16:19], v[224:239]
+; GCN-NEXT: s_waitcnt lgkmcnt(12)
+; GCN-NEXT: v_accvgpr_read_b32 v16, a60
+; GCN-NEXT: v_accvgpr_read_b32 v17, a61
+; GCN-NEXT: v_accvgpr_read_b32 v18, a62
+; GCN-NEXT: v_accvgpr_read_b32 v19, a63
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[192:207], v[16:19], v[16:19], v[208:223]
+; GCN-NEXT: s_waitcnt lgkmcnt(11)
+; GCN-NEXT: v_accvgpr_read_b32 v16, a64
+; GCN-NEXT: v_accvgpr_read_b32 v17, a65
+; GCN-NEXT: v_accvgpr_read_b32 v18, a66
+; GCN-NEXT: v_accvgpr_read_b32 v19, a67
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[176:191], v[16:19], v[16:19], v[192:207]
+; GCN-NEXT: s_waitcnt lgkmcnt(10)
+; GCN-NEXT: v_accvgpr_read_b32 v16, a68
+; GCN-NEXT: v_accvgpr_read_b32 v17, a69
+; GCN-NEXT: v_accvgpr_read_b32 v18, a70
+; GCN-NEXT: v_accvgpr_read_b32 v19, a71
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[160:175], v[16:19], v[16:19], v[176:191]
+; GCN-NEXT: s_waitcnt lgkmcnt(9)
+; GCN-NEXT: v_accvgpr_read_b32 v16, a72
+; GCN-NEXT: v_accvgpr_read_b32 v17, a73
+; GCN-NEXT: v_accvgpr_read_b32 v18, a74
+; GCN-NEXT: v_accvgpr_read_b32 v19, a75
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[144:159], v[16:19], v[16:19], v[160:175]
+; GCN-NEXT: s_waitcnt lgkmcnt(8)
+; GCN-NEXT: v_accvgpr_read_b32 v16, a76
+; GCN-NEXT: v_accvgpr_read_b32 v17, a77
+; GCN-NEXT: v_accvgpr_read_b32 v18, a78
+; GCN-NEXT: v_accvgpr_read_b32 v19, a79
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[128:143], v[16:19], v[16:19], v[144:159]
+; GCN-NEXT: s_waitcnt lgkmcnt(7)
+; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[112:127], v[0:3], v[0:3], v[128:143]
+; GCN-NEXT: s_waitcnt lgkmcnt(6)
+; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[96:111], v[4:7], v[4:7], v[112:127]
+; GCN-NEXT: s_waitcnt lgkmcnt(5)
+; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[80:95], v[8:11], v[8:11], v[96:111]
+; GCN-NEXT: s_waitcnt lgkmcnt(4)
+; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[64:79], v[32:35], v[32:35], v[80:95]
+; GCN-NEXT: s_waitcnt lgkmcnt(3)
+; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[48:63], v[36:39], v[36:39], v[64:79]
+; GCN-NEXT: s_waitcnt lgkmcnt(2)
+; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[40:43], v[40:43], v[48:63]
+; GCN-NEXT: s_nop 9
+; GCN-NEXT: v_accvgpr_write_b32 a32, v48
+; GCN-NEXT: v_accvgpr_write_b32 a33, v49
+; GCN-NEXT: v_accvgpr_write_b32 a34, v50
+; GCN-NEXT: v_accvgpr_write_b32 a35, v51
+; GCN-NEXT: v_accvgpr_write_b32 a36, v52
+; GCN-NEXT: v_accvgpr_write_b32 a37, v53
+; GCN-NEXT: v_accvgpr_write_b32 a38, v54
+; GCN-NEXT: v_accvgpr_write_b32 a39, v55
+; GCN-NEXT: v_accvgpr_write_b32 a40, v56
+; GCN-NEXT: v_accvgpr_write_b32 a41, v57
+; GCN-NEXT: v_accvgpr_write_b32 a42, v58
+; GCN-NEXT: v_accvgpr_write_b32 a43, v59
+; GCN-NEXT: v_accvgpr_write_b32 a44, v60
+; GCN-NEXT: v_accvgpr_write_b32 a45, v61
+; GCN-NEXT: v_accvgpr_write_b32 a46, v62
+; GCN-NEXT: v_accvgpr_write_b32 a47, v63
+; GCN-NEXT: s_waitcnt lgkmcnt(1)
+; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[48:63], v[44:47], v[44:47], v[16:31]
+; GCN-NEXT: v_accvgpr_write_b32 a16, v16
+; GCN-NEXT: v_accvgpr_write_b32 a17, v17
+; GCN-NEXT: v_accvgpr_write_b32 a18, v18
+; GCN-NEXT: v_accvgpr_write_b32 a19, v19
+; GCN-NEXT: v_accvgpr_write_b32 a20, v20
+; GCN-NEXT: v_accvgpr_write_b32 a21, v21
+; GCN-NEXT: v_accvgpr_write_b32 a22, v22
+; GCN-NEXT: v_accvgpr_write_b32 a23, v23
+; GCN-NEXT: v_accvgpr_write_b32 a24, v24
+; GCN-NEXT: v_accvgpr_write_b32 a25, v25
+; GCN-NEXT: v_accvgpr_write_b32 a26, v26
+; GCN-NEXT: v_accvgpr_write_b32 a27, v27
+; GCN-NEXT: v_accvgpr_write_b32 a28, v28
+; GCN-NEXT: v_accvgpr_write_b32 a29, v29
+; GCN-NEXT: v_accvgpr_write_b32 a30, v30
+; GCN-NEXT: v_accvgpr_write_b32 a31, v31
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[12:15], v[12:15], v[48:63]
+; GCN-NEXT: s_cbranch_vccnz .LBB0_1
+; GCN-NEXT: ; %bb.2: ; %bb.2
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GCN-NEXT: v_accvgpr_write_b32 a0, v48
+; GCN-NEXT: v_accvgpr_write_b32 a1, v49
+; GCN-NEXT: v_accvgpr_write_b32 a2, v50
+; GCN-NEXT: v_accvgpr_write_b32 a3, v51
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GCN-NEXT: v_accvgpr_write_b32 a4, v52
+; GCN-NEXT: v_accvgpr_write_b32 a5, v53
+; GCN-NEXT: v_accvgpr_write_b32 a6, v54
+; GCN-NEXT: v_accvgpr_write_b32 a7, v55
+; GCN-NEXT: v_accvgpr_write_b32 a8, v56
+; GCN-NEXT: v_accvgpr_write_b32 a9, v57
+; GCN-NEXT: v_accvgpr_write_b32 a10, v58
+; GCN-NEXT: v_accvgpr_write_b32 a11, v59
+; GCN-NEXT: v_accvgpr_write_b32 a12, v60
+; GCN-NEXT: v_accvgpr_write_b32 a13, v61
+; GCN-NEXT: v_accvgpr_write_b32 a14, v62
+; GCN-NEXT: v_accvgpr_write_b32 a15, v63
+; GCN-NEXT: flat_store_dwordx4 v[0:1], v[244:247] offset:16
+; GCN-NEXT: flat_store_dwordx4 v[0:1], v[240:243]
+; GCN-NEXT: flat_store_dwordx4 v[0:1], v[224:227] offset:32
+; GCN-NEXT: flat_store_dwordx4 v[0:1], v[228:231] offset:48
+; GCN-NEXT: flat_store_dwordx4 v[0:1], v[208:211] offset:64
+; GCN-NEXT: flat_store_dwordx4 v[0:1], v[212:215] offset:80
+; GCN-NEXT: flat_store_dwordx4 v[0:1], v[192:195] offset:96
+; GCN-NEXT: flat_store_dwordx4 v[0:1], v[196:199] offset:112
+; GCN-NEXT: flat_store_dwordx4 v[0:1], v[180:183] offset:144
+; GCN-NEXT: flat_store_dwordx4 v[0:1], v[176:179] offset:128
+; GCN-NEXT: flat_store_dwordx4 v[0:1], v[160:163] offset:160
+; GCN-NEXT: flat_store_dwordx4 v[0:1], v[164:167] offset:176
+; GCN-NEXT: flat_store_dwordx4 v[0:1], v[148:151] offset:208
+; GCN-NEXT: flat_store_dwordx4 v[0:1], v[156:159] offset:240
+; GCN-NEXT: flat_store_dwordx4 v[0:1], v[152:155] offset:224
+; GCN-NEXT: flat_store_dwordx4 v[0:1], v[144:147] offset:192
+; GCN-NEXT: flat_store_dwordx4 v[0:1], v[140:143] offset:272
+; GCN-NEXT: flat_store_dwordx4 v[0:1], v[136:139] offset:256
+; GCN-NEXT: flat_store_dwordx4 v[0:1], v[132:135] offset:240
+; GCN-NEXT: flat_store_dwordx4 v[0:1], v[128:131] offset:224
+; GCN-NEXT: flat_store_dwordx4 v[0:1], v[124:127] offset:304
+; GCN-NEXT: flat_store_dwordx4 v[0:1], v[120:123] offset:288
+; GCN-NEXT: flat_store_dwordx4 v[0:1], v[116:119] offset:272
+; GCN-NEXT: flat_store_dwordx4 v[0:1], v[112:115] offset:256
+; GCN-NEXT: flat_store_dwordx4 v[0:1], v[108:111] offset:336
+; GCN-NEXT: flat_store_dwordx4 v[0:1], v[104:107] offset:320
+; GCN-NEXT: flat_store_dwordx4 v[0:1], v[100:103] offset:304
+; GCN-NEXT: flat_store_dwordx4 v[0:1], v[96:99] offset:288
+; GCN-NEXT: flat_store_dwordx4 v[0:1], v[92:95] offset:368
+; GCN-NEXT: flat_store_dwordx4 v[0:1], v[88:91] offset:352
+; GCN-NEXT: flat_store_dwordx4 v[0:1], v[84:87] offset:336
+; GCN-NEXT: flat_store_dwordx4 v[0:1], v[80:83] offset:320
+; GCN-NEXT: flat_store_dwordx4 v[0:1], v[76:79] offset:400
+; GCN-NEXT: flat_store_dwordx4 v[0:1], v[72:75] offset:384
+; GCN-NEXT: flat_store_dwordx4 v[0:1], v[68:71] offset:368
+; GCN-NEXT: flat_store_dwordx4 v[0:1], v[64:67] offset:352
+; GCN-NEXT: flat_store_dwordx4 v[0:1], a[44:47] offset:432
+; GCN-NEXT: flat_store_dwordx4 v[0:1], a[40:43] offset:416
+; GCN-NEXT: flat_store_dwordx4 v[0:1], a[36:39] offset:400
+; GCN-NEXT: flat_store_dwordx4 v[0:1], a[32:35] offset:384
+; GCN-NEXT: flat_store_dwordx4 v[0:1], a[28:31] offset:464
+; GCN-NEXT: flat_store_dwordx4 v[0:1], a[24:27] offset:448
+; GCN-NEXT: flat_store_dwordx4 v[0:1], a[20:23] offset:432
+; GCN-NEXT: flat_store_dwordx4 v[0:1], a[16:19] offset:416
+; GCN-NEXT: flat_store_dwordx4 v[0:1], a[12:15] offset:496
+; GCN-NEXT: flat_store_dwordx4 v[0:1], a[8:11] offset:480
+; GCN-NEXT: flat_store_dwordx4 v[0:1], a[4:7] offset:464
+; GCN-NEXT: flat_store_dwordx4 v[0:1], a[0:3] offset:448
+; GCN-NEXT: flat_store_dwordx4 v[0:1], v[28:31] offset:528
+; GCN-NEXT: flat_store_dwordx4 v[0:1], v[24:27] offset:512
+; GCN-NEXT: flat_store_dwordx4 v[0:1], v[20:23] offset:496
+; GCN-NEXT: flat_store_dwordx4 v[0:1], v[16:19] offset:480
+; GCN-NEXT: s_endpgm
%gep1 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 4
%gep2 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 8
%gep3 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 12
@@ -292,6 +618,177 @@ define amdgpu_kernel void @good_rp(ptr addrspace(3) %in0, ptr addrspace(0) %out,
; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[4:7] offset:336
; CHECK-NEXT: flat_store_dwordx4 v[88:89], v[0:3] offset:320
; CHECK-NEXT: s_endpgm
+; INFLATE-LABEL: good_rp:
+; INFLATE: ; %bb.0:
+; INFLATE-NEXT: s_load_dword s0, s[4:5], 0x10
+; INFLATE-NEXT: s_load_dword s1, s[4:5], 0x0
+; INFLATE-NEXT: s_waitcnt lgkmcnt(0)
+; INFLATE-NEXT: s_bitcmp1_b32 s0, 0
+; INFLATE-NEXT: v_mov_b32_e32 v0, s1
+; INFLATE-NEXT: ds_read_b128 v[176:179], v0
+; INFLATE-NEXT: ds_read_b128 v[180:183], v0 offset:16
+; INFLATE-NEXT: ds_read_b128 v[184:187], v0 offset:32
+; INFLATE-NEXT: ds_read_b128 v[188:191], v0 offset:48
+; INFLATE-NEXT: ds_read_b128 v[192:195], v0 offset:64
+; INFLATE-NEXT: ds_read_b128 v[196:199], v0 offset:80
+; INFLATE-NEXT: ds_read_b128 v[200:203], v0 offset:96
+; INFLATE-NEXT: ds_read_b128 v[204:207], v0 offset:112
+; INFLATE-NEXT: ds_read_b128 v[208:211], v0 offset:128
+; INFLATE-NEXT: ds_read_b128 v[212:215], v0 offset:144
+; INFLATE-NEXT: ds_read_b128 v[216:219], v0 offset:160
+; INFLATE-NEXT: s_cselect_b64 s[0:1], -1, 0
+; INFLATE-NEXT: s_xor_b64 s[0:1], s[0:1], -1
+; INFLATE-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; INFLATE-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0
+; INFLATE-NEXT: .LBB1_1: ; %bb.1
+; INFLATE-NEXT: ; =>This Inner Loop Header: Depth=1
+; INFLATE-NEXT: s_waitcnt lgkmcnt(10)
+; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[160:175], v[176:179], v[176:179], 0
+; INFLATE-NEXT: s_and_b64 vcc, exec, s[0:1]
+; INFLATE-NEXT: s_waitcnt lgkmcnt(9)
+; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[144:159], v[180:183], v[180:183], v[160:175]
+; INFLATE-NEXT: s_waitcnt lgkmcnt(8)
+; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[128:143], v[184:187], v[184:187], v[144:159]
+; INFLATE-NEXT: s_waitcnt lgkmcnt(7)
+; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[112:127], v[188:191], v[188:191], v[128:143]
+; INFLATE-NEXT: s_waitcnt lgkmcnt(6)
+; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[96:111], v[192:195], v[192:195], v[112:127]
+; INFLATE-NEXT: s_waitcnt lgkmcnt(5)
+; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[80:95], v[196:199], v[196:199], v[96:111]
+; INFLATE-NEXT: s_waitcnt lgkmcnt(4)
+; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[64:79], v[200:203], v[200:203], v[80:95]
+; INFLATE-NEXT: s_waitcnt lgkmcnt(3)
+; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[48:63], v[204:207], v[204:207], v[64:79]
+; INFLATE-NEXT: s_waitcnt lgkmcnt(2)
+; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[32:47], v[208:211], v[208:211], v[48:63]
+; INFLATE-NEXT: s_waitcnt lgkmcnt(1)
+; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[212:215], v[212:215], v[32:47]
+; INFLATE-NEXT: s_waitcnt lgkmcnt(0)
+; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[216:219], v[216:219], v[16:31]
+; INFLATE-NEXT: s_cbranch_vccnz .LBB1_1
+; INFLATE-NEXT: ; %bb.2: ; %bb.2
+; INFLATE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; INFLATE-NEXT: s_waitcnt lgkmcnt(0)
+; INFLATE-NEXT: v_mov_b64_e32 v[88:89], s[0:1]
+; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[164:167] offset:16
+; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[160:163]
+; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[144:147] offset:32
+; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[148:151] offset:48
+; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[128:131] offset:64
+; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[132:135] offset:80
+; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[112:115] offset:96
+; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[116:119] offset:112
+; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[100:103] offset:144
+; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[96:99] offset:128
+; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[80:83] offset:160
+; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[84:87] offset:176
+; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[68:71] offset:208
+; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[76:79] offset:240
+; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[72:75] offset:224
+; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[64:67] offset:192
+; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[60:63] offset:272
+; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[56:59] offset:256
+; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[52:55] offset:240
+; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[48:51] offset:224
+; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[44:47] offset:304
+; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[40:43] offset:288
+; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[36:39] offset:272
+; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[32:35] offset:256
+; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[28:31] offset:336
+; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[24:27] offset:320
+; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[20:23] offset:304
+; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[16:19] offset:288
+; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[12:15] offset:368
+; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[8:11] offset:352
+; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[4:7] offset:336
+; INFLATE-NEXT: flat_store_dwordx4 v[88:89], v[0:3] offset:320
+; INFLATE-NEXT: s_endpgm
+;
+; GCN-LABEL: good_rp:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s0, s[4:5], 0x10
+; GCN-NEXT: s_load_dword s1, s[4:5], 0x0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_bitcmp1_b32 s0, 0
+; GCN-NEXT: v_mov_b32_e32 v0, s1
+; GCN-NEXT: ds_read_b128 v[176:179], v0
+; GCN-NEXT: ds_read_b128 v[180:183], v0 offset:16
+; GCN-NEXT: ds_read_b128 v[184:187], v0 offset:32
+; GCN-NEXT: ds_read_b128 v[188:191], v0 offset:48
+; GCN-NEXT: ds_read_b128 v[192:195], v0 offset:64
+; GCN-NEXT: ds_read_b128 v[196:199], v0 offset:80
+; GCN-NEXT: ds_read_b128 v[200:203], v0 offset:96
+; GCN-NEXT: ds_read_b128 v[204:207], v0 offset:112
+; GCN-NEXT: ds_read_b128 v[208:211], v0 offset:128
+; GCN-NEXT: ds_read_b128 v[212:215], v0 offset:144
+; GCN-NEXT: ds_read_b128 v[216:219], v0 offset:160
+; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], -1
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0
+; GCN-NEXT: .LBB1_1: ; %bb.1
+; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN-NEXT: s_waitcnt lgkmcnt(10)
+; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[160:175], v[176:179], v[176:179], 0
+; GCN-NEXT: s_and_b64 vcc, exec, s[0:1]
+; GCN-NEXT: s_waitcnt lgkmcnt(9)
+; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[144:159], v[180:183], v[180:183], v[160:175]
+; GCN-NEXT: s_waitcnt lgkmcnt(8)
+; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[128:143], v[184:187], v[184:187], v[144:159]
+; GCN-NEXT: s_waitcnt lgkmcnt(7)
+; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[112:127], v[188:191], v[188:191], v[128:143]
+; GCN-NEXT: s_waitcnt lgkmcnt(6)
+; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[96:111], v[192:195], v[192:195], v[112:127]
+; GCN-NEXT: s_waitcnt lgkmcnt(5)
+; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[80:95], v[196:199], v[196:199], v[96:111]
+; GCN-NEXT: s_waitcnt lgkmcnt(4)
+; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[64:79], v[200:203], v[200:203], v[80:95]
+; GCN-NEXT: s_waitcnt lgkmcnt(3)
+; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[48:63], v[204:207], v[204:207], v[64:79]
+; GCN-NEXT: s_waitcnt lgkmcnt(2)
+; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[32:47], v[208:211], v[208:211], v[48:63]
+; GCN-NEXT: s_waitcnt lgkmcnt(1)
+; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[212:215], v[212:215], v[32:47]
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[216:219], v[216:219], v[16:31]
+; GCN-NEXT: s_cbranch_vccnz .LBB1_1
+; GCN-NEXT: ; %bb.2: ; %bb.2
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b64_e32 v[88:89], s[0:1]
+; GCN-NEXT: flat_store_dwordx4 v[88:89], v[164:167] offset:16
+; GCN-NEXT: flat_store_dwordx4 v[88:89], v[160:163]
+; GCN-NEXT: flat_store_dwordx4 v[88:89], v[144:147] offset:32
+; GCN-NEXT: flat_store_dwordx4 v[88:89], v[148:151] offset:48
+; GCN-NEXT: flat_store_dwordx4 v[88:89], v[128:131] offset:64
+; GCN-NEXT: flat_store_dwordx4 v[88:89], v[132:135] offset:80
+; GCN-NEXT: flat_store_dwordx4 v[88:89], v[112:115] offset:96
+; GCN-NEXT: flat_store_dwordx4 v[88:89], v[116:119] offset:112
+; GCN-NEXT: flat_store_dwordx4 v[88:89], v[100:103] offset:144
+; GCN-NEXT: flat_store_dwordx4 v[88:89], v[96:99] offset:128
+; GCN-NEXT: flat_store_dwordx4 v[88:89], v[80:83] offset:160
+; GCN-NEXT: flat_store_dwordx4 v[88:89], v[84:87] offset:176
+; GCN-NEXT: flat_store_dwordx4 v[88:89], v[68:71] offset:208
+; GCN-NEXT: flat_store_dwordx4 v[88:89], v[76:79] offset:240
+; GCN-NEXT: flat_store_dwordx4 v[88:89], v[72:75] offset:224
+; GCN-NEXT: flat_store_dwordx4 v[88:89], v[64:67] offset:192
+; GCN-NEXT: flat_store_dwordx4 v[88:89], v[60:63] offset:272
+; GCN-NEXT: flat_store_dwordx4 v[88:89], v[56:59] offset:256
+; GCN-NEXT: flat_store_dwordx4 v[88:89], v[52:55] offset:240
+; GCN-NEXT: flat_store_dwordx4 v[88:89], v[48:51] offset:224
+; GCN-NEXT: flat_store_dwordx4 v[88:89], v[44:47] offset:304
+; GCN-NEXT: flat_store_dwordx4 v[88:89], v[40:43] offset:288
+; GCN-NEXT: flat_store_dwordx4 v[88:89], v[36:39] offset:272
+; GCN-NEXT: flat_store_dwordx4 v[88:89], v[32:35] offset:256
+; GCN-NEXT: flat_store_dwordx4 v[88:89], v[28:31] offset:336
+; GCN-NEXT: flat_store_dwordx4 v[88:89], v[24:27] offset:320
+; GCN-NEXT: flat_store_dwordx4 v[88:89], v[20:23] offset:304
+; GCN-NEXT: flat_store_dwordx4 v[88:89], v[16:19] offset:288
+; GCN-NEXT: flat_store_dwordx4 v[88:89], v[12:15] offset:368
+; GCN-NEXT: flat_store_dwordx4 v[88:89], v[8:11] offset:352
+; GCN-NEXT: flat_store_dwordx4 v[88:89], v[4:7] offset:336
+; GCN-NEXT: flat_store_dwordx4 v[88:89], v[0:3] offset:320
+; GCN-NEXT: s_endpgm
%gep1 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 4
%gep2 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 8
%gep3 = getelementptr ptr addrspace(3), ptr addrspace(3) %in0, i32 12
More information about the llvm-commits
mailing list