[llvm] b79a665 - [AMDGPU] Remove leftover implicit operands from SI_SPILL/SI_RESTORE. (#168546)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Nov 19 07:02:08 PST 2025
Author: LU-JOHN
Date: 2025-11-19T09:02:03-06:00
New Revision: b79a665f7170fbb631b13175ec747ccfd779bf9e
URL: https://github.com/llvm/llvm-project/commit/b79a665f7170fbb631b13175ec747ccfd779bf9e
DIFF: https://github.com/llvm/llvm-project/commit/b79a665f7170fbb631b13175ec747ccfd779bf9e.diff
LOG: [AMDGPU] Remove leftover implicit operands from SI_SPILL/SI_RESTORE. (#168546)
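Remove leftover implicit operands from SI_SPILL/SI_RESTORE: expandPostRAPseudo now rewrites SI_SPILL_S32_TO_VGPR and SI_RESTORE_S32_FROM_VGPR to V_WRITELANE_B32 and V_READLANE_B32 via mutateAndCleanupImplicit() instead of MI.setDesc().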
---------
Signed-off-by: John Lu <John.Lu at amd.com>
Added:
Modified:
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll
llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir
llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll
llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll
llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 7cb7f47ddb220..630fdc8e8891c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2094,11 +2094,11 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
break;
case AMDGPU::SI_SPILL_S32_TO_VGPR:
- MI.setDesc(get(AMDGPU::V_WRITELANE_B32));
+ mutateAndCleanupImplicit(MI, get(AMDGPU::V_WRITELANE_B32));
break;
case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
- MI.setDesc(get(AMDGPU::V_READLANE_B32));
+ mutateAndCleanupImplicit(MI, get(AMDGPU::V_READLANE_B32));
break;
case AMDGPU::AV_MOV_B32_IMM_PSEUDO: {
Register Dst = MI.getOperand(0).getReg();
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
index 8879ef5c8265d..d965a3dbcc8a4 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
@@ -8181,8 +8181,8 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
; SI-NEXT: v_mov_b32_e32 v2, s18
; SI-NEXT: v_readlane_b32 s18, v23, 0
; SI-NEXT: s_and_b32 s16, s16, 0xff
-; SI-NEXT: v_readlane_b32 s19, v23, 1
; SI-NEXT: s_lshl_b32 s18, s18, 8
+; SI-NEXT: v_readlane_b32 s19, v23, 1
; SI-NEXT: s_or_b32 s16, s16, s18
; SI-NEXT: v_readlane_b32 s18, v23, 2
; SI-NEXT: v_readlane_b32 s19, v23, 3
@@ -8215,8 +8215,8 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
; SI-NEXT: v_mov_b32_e32 v2, s16
; SI-NEXT: v_readlane_b32 s16, v23, 6
; SI-NEXT: s_and_b32 s14, s14, 0xff
-; SI-NEXT: v_readlane_b32 s17, v23, 7
; SI-NEXT: s_lshl_b32 s16, s16, 8
+; SI-NEXT: v_readlane_b32 s17, v23, 7
; SI-NEXT: s_or_b32 s14, s14, s16
; SI-NEXT: v_readlane_b32 s16, v23, 8
; SI-NEXT: v_readlane_b32 s17, v23, 9
@@ -8249,8 +8249,8 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
; SI-NEXT: v_mov_b32_e32 v2, s14
; SI-NEXT: v_readlane_b32 s14, v23, 12
; SI-NEXT: s_and_b32 s12, s12, 0xff
-; SI-NEXT: v_readlane_b32 s15, v23, 13
; SI-NEXT: s_lshl_b32 s14, s14, 8
+; SI-NEXT: v_readlane_b32 s15, v23, 13
; SI-NEXT: s_or_b32 s12, s12, s14
; SI-NEXT: v_readlane_b32 s14, v23, 14
; SI-NEXT: v_readlane_b32 s15, v23, 15
@@ -8283,8 +8283,8 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
; SI-NEXT: v_mov_b32_e32 v2, s12
; SI-NEXT: v_readlane_b32 s12, v23, 18
; SI-NEXT: s_and_b32 s10, s10, 0xff
-; SI-NEXT: v_readlane_b32 s13, v23, 19
; SI-NEXT: s_lshl_b32 s12, s12, 8
+; SI-NEXT: v_readlane_b32 s13, v23, 19
; SI-NEXT: s_or_b32 s10, s10, s12
; SI-NEXT: v_readlane_b32 s12, v23, 20
; SI-NEXT: v_readlane_b32 s13, v23, 21
@@ -8317,8 +8317,8 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
; SI-NEXT: v_mov_b32_e32 v2, s10
; SI-NEXT: v_readlane_b32 s10, v23, 24
; SI-NEXT: s_and_b32 s8, s8, 0xff
-; SI-NEXT: v_readlane_b32 s11, v23, 25
; SI-NEXT: s_lshl_b32 s10, s10, 8
+; SI-NEXT: v_readlane_b32 s11, v23, 25
; SI-NEXT: s_or_b32 s8, s8, s10
; SI-NEXT: v_readlane_b32 s10, v23, 26
; SI-NEXT: v_readlane_b32 s11, v23, 27
@@ -8350,8 +8350,8 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
; SI-NEXT: v_mov_b32_e32 v2, s8
; SI-NEXT: v_readlane_b32 s8, v23, 30
; SI-NEXT: s_and_b32 s6, s6, 0xff
-; SI-NEXT: v_readlane_b32 s9, v23, 31
; SI-NEXT: s_lshl_b32 s8, s8, 8
+; SI-NEXT: v_readlane_b32 s9, v23, 31
; SI-NEXT: s_or_b32 s6, s6, s8
; SI-NEXT: v_readlane_b32 s8, v23, 32
; SI-NEXT: v_readlane_b32 s9, v23, 33
@@ -8384,8 +8384,8 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
; SI-NEXT: v_mov_b32_e32 v2, s6
; SI-NEXT: v_readlane_b32 s6, v23, 36
; SI-NEXT: s_and_b32 s4, s4, 0xff
-; SI-NEXT: v_readlane_b32 s7, v23, 37
; SI-NEXT: s_lshl_b32 s6, s6, 8
+; SI-NEXT: v_readlane_b32 s7, v23, 37
; SI-NEXT: s_or_b32 s4, s4, s6
; SI-NEXT: v_readlane_b32 s6, v23, 38
; SI-NEXT: v_readlane_b32 s7, v23, 39
@@ -8468,148 +8468,149 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB13_4:
-; SI-NEXT: ; implicit-def: $sgpr51
; SI-NEXT: ; implicit-def: $sgpr50
-; SI-NEXT: ; implicit-def: $sgpr26
-; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_writelane_b32 v23, s50, 0
-; SI-NEXT: v_writelane_b32 v23, s51, 1
-; SI-NEXT: ; implicit-def: $sgpr50
+; SI-NEXT: ; implicit-def: $sgpr51
; SI-NEXT: ; implicit-def: $sgpr26
+; SI-NEXT: v_writelane_b32 v23, s51, 1
; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: ; implicit-def: $sgpr26
-; SI-NEXT: v_writelane_b32 v23, s50, 2
-; SI-NEXT: v_writelane_b32 v23, s51, 3
; SI-NEXT: ; implicit-def: $sgpr50
+; SI-NEXT: v_writelane_b32 v23, s50, 2
; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: ; implicit-def: $sgpr26
+; SI-NEXT: v_writelane_b32 v23, s51, 3
; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: ; implicit-def: $sgpr26
-; SI-NEXT: v_writelane_b32 v23, s50, 4
-; SI-NEXT: v_writelane_b32 v23, s51, 5
; SI-NEXT: ; implicit-def: $sgpr50
+; SI-NEXT: v_writelane_b32 v23, s50, 4
; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: ; implicit-def: $sgpr26
+; SI-NEXT: v_writelane_b32 v23, s51, 5
; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: ; implicit-def: $sgpr26
-; SI-NEXT: v_writelane_b32 v23, s50, 6
-; SI-NEXT: v_writelane_b32 v23, s51, 7
; SI-NEXT: ; implicit-def: $sgpr50
+; SI-NEXT: v_writelane_b32 v23, s50, 6
; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: ; implicit-def: $sgpr26
+; SI-NEXT: v_writelane_b32 v23, s51, 7
; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: ; implicit-def: $sgpr26
-; SI-NEXT: v_writelane_b32 v23, s50, 8
-; SI-NEXT: v_writelane_b32 v23, s51, 9
; SI-NEXT: ; implicit-def: $sgpr50
+; SI-NEXT: v_writelane_b32 v23, s50, 8
; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: ; implicit-def: $sgpr26
+; SI-NEXT: v_writelane_b32 v23, s51, 9
; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: ; implicit-def: $sgpr26
-; SI-NEXT: v_writelane_b32 v23, s50, 10
-; SI-NEXT: v_writelane_b32 v23, s51, 11
; SI-NEXT: ; implicit-def: $sgpr50
+; SI-NEXT: v_writelane_b32 v23, s50, 10
; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: ; implicit-def: $sgpr26
+; SI-NEXT: v_writelane_b32 v23, s51, 11
; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: ; implicit-def: $sgpr26
-; SI-NEXT: v_writelane_b32 v23, s50, 12
-; SI-NEXT: v_writelane_b32 v23, s51, 13
; SI-NEXT: ; implicit-def: $sgpr50
+; SI-NEXT: v_writelane_b32 v23, s50, 12
; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: ; implicit-def: $sgpr26
+; SI-NEXT: v_writelane_b32 v23, s51, 13
; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: ; implicit-def: $sgpr26
-; SI-NEXT: v_writelane_b32 v23, s50, 14
-; SI-NEXT: v_writelane_b32 v23, s51, 15
; SI-NEXT: ; implicit-def: $sgpr50
+; SI-NEXT: v_writelane_b32 v23, s50, 14
; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: ; implicit-def: $sgpr26
+; SI-NEXT: v_writelane_b32 v23, s51, 15
; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: ; implicit-def: $sgpr26
-; SI-NEXT: v_writelane_b32 v23, s50, 16
-; SI-NEXT: v_writelane_b32 v23, s51, 17
; SI-NEXT: ; implicit-def: $sgpr50
+; SI-NEXT: v_writelane_b32 v23, s50, 16
; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: ; implicit-def: $sgpr26
+; SI-NEXT: v_writelane_b32 v23, s51, 17
; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: ; implicit-def: $sgpr26
-; SI-NEXT: v_writelane_b32 v23, s50, 18
-; SI-NEXT: v_writelane_b32 v23, s51, 19
; SI-NEXT: ; implicit-def: $sgpr50
+; SI-NEXT: v_writelane_b32 v23, s50, 18
; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: ; implicit-def: $sgpr26
+; SI-NEXT: v_writelane_b32 v23, s51, 19
; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: ; implicit-def: $sgpr26
-; SI-NEXT: v_writelane_b32 v23, s50, 20
-; SI-NEXT: v_writelane_b32 v23, s51, 21
; SI-NEXT: ; implicit-def: $sgpr50
+; SI-NEXT: v_writelane_b32 v23, s50, 20
; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: ; implicit-def: $sgpr26
+; SI-NEXT: v_writelane_b32 v23, s51, 21
; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: ; implicit-def: $sgpr26
-; SI-NEXT: v_writelane_b32 v23, s50, 22
-; SI-NEXT: v_writelane_b32 v23, s51, 23
; SI-NEXT: ; implicit-def: $sgpr50
+; SI-NEXT: v_writelane_b32 v23, s50, 22
; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: ; implicit-def: $sgpr26
+; SI-NEXT: v_writelane_b32 v23, s51, 23
; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: ; implicit-def: $sgpr26
-; SI-NEXT: v_writelane_b32 v23, s50, 24
-; SI-NEXT: v_writelane_b32 v23, s51, 25
; SI-NEXT: ; implicit-def: $sgpr50
+; SI-NEXT: v_writelane_b32 v23, s50, 24
; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: ; implicit-def: $sgpr26
+; SI-NEXT: v_writelane_b32 v23, s51, 25
; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: ; implicit-def: $sgpr26
-; SI-NEXT: v_writelane_b32 v23, s50, 26
-; SI-NEXT: v_writelane_b32 v23, s51, 27
; SI-NEXT: ; implicit-def: $sgpr50
+; SI-NEXT: v_writelane_b32 v23, s50, 26
; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: ; implicit-def: $sgpr26
+; SI-NEXT: v_writelane_b32 v23, s51, 27
; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: ; implicit-def: $sgpr26
-; SI-NEXT: v_writelane_b32 v23, s50, 28
-; SI-NEXT: v_writelane_b32 v23, s51, 29
; SI-NEXT: ; implicit-def: $sgpr50
+; SI-NEXT: v_writelane_b32 v23, s50, 28
; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: ; implicit-def: $sgpr26
+; SI-NEXT: v_writelane_b32 v23, s51, 29
; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: ; implicit-def: $sgpr26
-; SI-NEXT: v_writelane_b32 v23, s50, 30
-; SI-NEXT: v_writelane_b32 v23, s51, 31
; SI-NEXT: ; implicit-def: $sgpr50
+; SI-NEXT: v_writelane_b32 v23, s50, 30
; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: ; implicit-def: $sgpr26
+; SI-NEXT: v_writelane_b32 v23, s51, 31
; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: ; implicit-def: $sgpr26
-; SI-NEXT: v_writelane_b32 v23, s50, 32
-; SI-NEXT: v_writelane_b32 v23, s51, 33
; SI-NEXT: ; implicit-def: $sgpr50
+; SI-NEXT: v_writelane_b32 v23, s50, 32
; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: ; implicit-def: $sgpr26
+; SI-NEXT: v_writelane_b32 v23, s51, 33
; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: ; implicit-def: $sgpr26
-; SI-NEXT: v_writelane_b32 v23, s50, 34
-; SI-NEXT: v_writelane_b32 v23, s51, 35
; SI-NEXT: ; implicit-def: $sgpr50
+; SI-NEXT: v_writelane_b32 v23, s50, 34
; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: ; implicit-def: $sgpr26
+; SI-NEXT: v_writelane_b32 v23, s51, 35
; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: ; implicit-def: $sgpr26
-; SI-NEXT: v_writelane_b32 v23, s50, 36
-; SI-NEXT: v_writelane_b32 v23, s51, 37
; SI-NEXT: ; implicit-def: $sgpr50
+; SI-NEXT: v_writelane_b32 v23, s50, 36
; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: ; implicit-def: $sgpr26
+; SI-NEXT: v_writelane_b32 v23, s51, 37
; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: ; implicit-def: $sgpr26
+; SI-NEXT: ; implicit-def: $sgpr50
; SI-NEXT: v_writelane_b32 v23, s50, 38
+; SI-NEXT: ; kill: killed $sgpr26
+; SI-NEXT: ; implicit-def: $sgpr26
; SI-NEXT: v_writelane_b32 v23, s51, 39
-; SI-NEXT: ; implicit-def: $sgpr50
; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: ; implicit-def: $sgpr26
+; SI-NEXT: ; implicit-def: $sgpr50
+; SI-NEXT: v_writelane_b32 v23, s50, 40
; SI-NEXT: ; implicit-def: $sgpr49
; SI-NEXT: ; implicit-def: $sgpr55
; SI-NEXT: ; implicit-def: $sgpr54
@@ -8634,7 +8635,6 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
; SI-NEXT: ; implicit-def: $sgpr58
; SI-NEXT: ; implicit-def: $sgpr28
; SI-NEXT: ; implicit-def: $sgpr26
-; SI-NEXT: v_writelane_b32 v23, s50, 40
; SI-NEXT: ; implicit-def: $sgpr98
; SI-NEXT: ; implicit-def: $sgpr96
; SI-NEXT: ; implicit-def: $sgpr86
@@ -10597,10 +10597,9 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
; GFX9-NEXT: .LBB13_4:
; GFX9-NEXT: ; implicit-def: $sgpr27
; GFX9-NEXT: ; kill: killed $sgpr27
-; GFX9-NEXT: ; implicit-def: $sgpr83
-; GFX9-NEXT: ; implicit-def: $sgpr82
; GFX9-NEXT: ; implicit-def: $sgpr27
; GFX9-NEXT: ; kill: killed $sgpr27
+; GFX9-NEXT: ; implicit-def: $sgpr82
; GFX9-NEXT: v_writelane_b32 v22, s82, 0
; GFX9-NEXT: ; implicit-def: $sgpr27
; GFX9-NEXT: ; kill: killed $sgpr27
@@ -10633,6 +10632,7 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
; GFX9-NEXT: ; implicit-def: $sgpr85
; GFX9-NEXT: ; implicit-def: $sgpr84
; GFX9-NEXT: ; implicit-def: $sgpr81
+; GFX9-NEXT: ; implicit-def: $sgpr83
; GFX9-NEXT: ; implicit-def: $sgpr36
; GFX9-NEXT: ; implicit-def: $sgpr34
; GFX9-NEXT: ; implicit-def: $sgpr30
@@ -10985,12 +10985,10 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
; GFX11-NEXT: s_lshr_b64 s[62:63], s[20:21], 24
; GFX11-NEXT: s_branch .LBB13_3
; GFX11-NEXT: .LBB13_2:
-; GFX11-NEXT: ; implicit-def: $vcc_hi
; GFX11-NEXT: ; implicit-def: $vcc_lo
-; GFX11-NEXT: ; implicit-def: $sgpr42
-; GFX11-NEXT: ; kill: killed $sgpr42
-; GFX11-NEXT: s_mov_b32 s101, -1
; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 0
+; GFX11-NEXT: ; implicit-def: $vcc_hi
+; GFX11-NEXT: ; implicit-def: $vcc_lo
; GFX11-NEXT: ; implicit-def: $sgpr42
; GFX11-NEXT: ; kill: killed $sgpr42
; GFX11-NEXT: ; implicit-def: $sgpr42
@@ -11001,7 +10999,6 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
; GFX11-NEXT: ; kill: killed $sgpr42
; GFX11-NEXT: ; implicit-def: $sgpr42
; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 1
-; GFX11-NEXT: ; implicit-def: $vcc_lo
; GFX11-NEXT: ; kill: killed $sgpr42
; GFX11-NEXT: ; implicit-def: $sgpr42
; GFX11-NEXT: ; kill: killed $sgpr42
@@ -11013,6 +11010,7 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
; GFX11-NEXT: ; kill: killed $sgpr42
; GFX11-NEXT: ; implicit-def: $sgpr42
; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 2
+; GFX11-NEXT: ; implicit-def: $vcc_lo
; GFX11-NEXT: ; kill: killed $sgpr42
; GFX11-NEXT: ; implicit-def: $sgpr42
; GFX11-NEXT: ; kill: killed $sgpr42
@@ -11024,7 +11022,6 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
; GFX11-NEXT: ; kill: killed $sgpr42
; GFX11-NEXT: ; implicit-def: $sgpr42
; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 3
-; GFX11-NEXT: ; implicit-def: $vcc_lo
; GFX11-NEXT: ; kill: killed $sgpr42
; GFX11-NEXT: ; implicit-def: $sgpr42
; GFX11-NEXT: ; kill: killed $sgpr42
@@ -11036,6 +11033,7 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
; GFX11-NEXT: ; kill: killed $sgpr42
; GFX11-NEXT: ; implicit-def: $sgpr42
; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 4
+; GFX11-NEXT: ; implicit-def: $vcc_lo
; GFX11-NEXT: ; kill: killed $sgpr42
; GFX11-NEXT: ; implicit-def: $sgpr42
; GFX11-NEXT: ; kill: killed $sgpr42
@@ -11044,10 +11042,12 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
; GFX11-NEXT: ; implicit-def: $sgpr42
; GFX11-NEXT: ; kill: killed $sgpr42
; GFX11-NEXT: ; implicit-def: $sgpr42
+; GFX11-NEXT: s_mov_b32 s101, -1
+; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 5
+; GFX11-NEXT: ; kill: killed $sgpr42
+; GFX11-NEXT: ; implicit-def: $sgpr42
; GFX11-NEXT: ; kill: killed $sgpr42
; GFX11-NEXT: ; implicit-def: $sgpr42
-; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 5
-; GFX11-NEXT: ; implicit-def: $vcc_lo
; GFX11-NEXT: ; implicit-def: $sgpr45
; GFX11-NEXT: ; implicit-def: $sgpr44
; GFX11-NEXT: ; implicit-def: $sgpr30
@@ -11111,17 +11111,17 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
; GFX11-NEXT: ; implicit-def: $sgpr88
; GFX11-NEXT: ; implicit-def: $sgpr76
; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 6
-; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 7
; GFX11-NEXT: ; implicit-def: $vcc_lo
+; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 7
; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 8
-; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 9
; GFX11-NEXT: ; implicit-def: $vcc_lo
+; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 9
; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 10
-; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 11
; GFX11-NEXT: ; implicit-def: $vcc_lo
+; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 11
; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 12
-; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 13
; GFX11-NEXT: ; implicit-def: $vcc_lo
+; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 13
; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 14
; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 15
; GFX11-NEXT: .LBB13_3: ; %Flow
@@ -45248,147 +45248,149 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v37
; SI-NEXT: s_branch .LBB37_5
; SI-NEXT: .LBB37_3:
-; SI-NEXT: ; implicit-def: $sgpr61
; SI-NEXT: ; implicit-def: $sgpr60
-; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: ; kill: killed $sgpr4
; SI-NEXT: v_writelane_b32 v61, s60, 0
+; SI-NEXT: ; implicit-def: $sgpr61
; SI-NEXT: v_writelane_b32 v61, s61, 1
-; SI-NEXT: ; implicit-def: $sgpr60
; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: ; implicit-def: $sgpr60
+; SI-NEXT: v_writelane_b32 v61, s60, 2
; SI-NEXT: ; kill: killed $sgpr4
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: v_writelane_b32 v61, s60, 2
; SI-NEXT: v_writelane_b32 v61, s61, 3
-; SI-NEXT: ; implicit-def: $sgpr60
; SI-NEXT: ; kill: killed $sgpr4
; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: ; implicit-def: $sgpr60
+; SI-NEXT: v_writelane_b32 v61, s60, 4
; SI-NEXT: ; kill: killed $sgpr4
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: v_writelane_b32 v61, s60, 4
; SI-NEXT: v_writelane_b32 v61, s61, 5
-; SI-NEXT: ; implicit-def: $sgpr60
; SI-NEXT: ; kill: killed $sgpr4
; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: ; implicit-def: $sgpr60
+; SI-NEXT: v_writelane_b32 v61, s60, 6
; SI-NEXT: ; kill: killed $sgpr4
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: v_writelane_b32 v61, s60, 6
; SI-NEXT: v_writelane_b32 v61, s61, 7
-; SI-NEXT: ; implicit-def: $sgpr60
; SI-NEXT: ; kill: killed $sgpr4
; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: ; implicit-def: $sgpr60
+; SI-NEXT: v_writelane_b32 v61, s60, 8
; SI-NEXT: ; kill: killed $sgpr4
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: v_writelane_b32 v61, s60, 8
; SI-NEXT: v_writelane_b32 v61, s61, 9
-; SI-NEXT: ; implicit-def: $sgpr60
; SI-NEXT: ; kill: killed $sgpr4
; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: ; implicit-def: $sgpr60
+; SI-NEXT: v_writelane_b32 v61, s60, 10
; SI-NEXT: ; kill: killed $sgpr4
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: v_writelane_b32 v61, s60, 10
; SI-NEXT: v_writelane_b32 v61, s61, 11
-; SI-NEXT: ; implicit-def: $sgpr60
; SI-NEXT: ; kill: killed $sgpr4
; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: ; implicit-def: $sgpr60
+; SI-NEXT: v_writelane_b32 v61, s60, 12
; SI-NEXT: ; kill: killed $sgpr4
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: v_writelane_b32 v61, s60, 12
; SI-NEXT: v_writelane_b32 v61, s61, 13
-; SI-NEXT: ; implicit-def: $sgpr60
; SI-NEXT: ; kill: killed $sgpr4
; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: ; implicit-def: $sgpr60
+; SI-NEXT: v_writelane_b32 v61, s60, 14
; SI-NEXT: ; kill: killed $sgpr4
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: v_writelane_b32 v61, s60, 14
; SI-NEXT: v_writelane_b32 v61, s61, 15
-; SI-NEXT: ; implicit-def: $sgpr60
; SI-NEXT: ; kill: killed $sgpr4
; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: ; implicit-def: $sgpr60
+; SI-NEXT: v_writelane_b32 v61, s60, 16
; SI-NEXT: ; kill: killed $sgpr4
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: v_writelane_b32 v61, s60, 16
; SI-NEXT: v_writelane_b32 v61, s61, 17
-; SI-NEXT: ; implicit-def: $sgpr60
; SI-NEXT: ; kill: killed $sgpr4
; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: ; implicit-def: $sgpr60
+; SI-NEXT: v_writelane_b32 v61, s60, 18
; SI-NEXT: ; kill: killed $sgpr4
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: v_writelane_b32 v61, s60, 18
; SI-NEXT: v_writelane_b32 v61, s61, 19
-; SI-NEXT: ; implicit-def: $sgpr60
; SI-NEXT: ; kill: killed $sgpr4
; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: ; implicit-def: $sgpr60
+; SI-NEXT: v_writelane_b32 v61, s60, 20
; SI-NEXT: ; kill: killed $sgpr4
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: v_writelane_b32 v61, s60, 20
; SI-NEXT: v_writelane_b32 v61, s61, 21
-; SI-NEXT: ; implicit-def: $sgpr60
; SI-NEXT: ; kill: killed $sgpr4
; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: ; implicit-def: $sgpr60
+; SI-NEXT: v_writelane_b32 v61, s60, 22
; SI-NEXT: ; kill: killed $sgpr4
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: v_writelane_b32 v61, s60, 22
; SI-NEXT: v_writelane_b32 v61, s61, 23
-; SI-NEXT: ; implicit-def: $sgpr60
; SI-NEXT: ; kill: killed $sgpr4
; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: ; implicit-def: $sgpr60
+; SI-NEXT: v_writelane_b32 v61, s60, 24
; SI-NEXT: ; kill: killed $sgpr4
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: v_writelane_b32 v61, s60, 24
; SI-NEXT: v_writelane_b32 v61, s61, 25
-; SI-NEXT: ; implicit-def: $sgpr60
; SI-NEXT: ; kill: killed $sgpr4
; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: ; implicit-def: $sgpr60
+; SI-NEXT: v_writelane_b32 v61, s60, 26
; SI-NEXT: ; kill: killed $sgpr4
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: v_writelane_b32 v61, s60, 26
; SI-NEXT: v_writelane_b32 v61, s61, 27
-; SI-NEXT: ; implicit-def: $sgpr60
; SI-NEXT: ; kill: killed $sgpr4
; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: ; implicit-def: $sgpr60
+; SI-NEXT: v_writelane_b32 v61, s60, 28
; SI-NEXT: ; kill: killed $sgpr4
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: v_writelane_b32 v61, s60, 28
; SI-NEXT: v_writelane_b32 v61, s61, 29
-; SI-NEXT: ; implicit-def: $sgpr60
; SI-NEXT: ; kill: killed $sgpr4
; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: ; implicit-def: $sgpr60
+; SI-NEXT: v_writelane_b32 v61, s60, 30
; SI-NEXT: ; kill: killed $sgpr4
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: v_writelane_b32 v61, s60, 30
; SI-NEXT: v_writelane_b32 v61, s61, 31
-; SI-NEXT: ; implicit-def: $sgpr60
; SI-NEXT: ; kill: killed $sgpr4
; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: ; implicit-def: $sgpr60
+; SI-NEXT: v_writelane_b32 v61, s60, 32
; SI-NEXT: ; kill: killed $sgpr4
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: v_writelane_b32 v61, s60, 32
; SI-NEXT: v_writelane_b32 v61, s61, 33
-; SI-NEXT: ; implicit-def: $sgpr60
; SI-NEXT: ; kill: killed $sgpr4
; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: ; implicit-def: $sgpr60
+; SI-NEXT: v_writelane_b32 v61, s60, 34
; SI-NEXT: ; kill: killed $sgpr4
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: v_writelane_b32 v61, s60, 34
; SI-NEXT: v_writelane_b32 v61, s61, 35
-; SI-NEXT: ; implicit-def: $sgpr60
; SI-NEXT: ; kill: killed $sgpr4
; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: ; implicit-def: $sgpr60
+; SI-NEXT: v_writelane_b32 v61, s60, 36
; SI-NEXT: ; kill: killed $sgpr4
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: v_writelane_b32 v61, s60, 36
; SI-NEXT: v_writelane_b32 v61, s61, 37
-; SI-NEXT: ; implicit-def: $sgpr60
; SI-NEXT: ; kill: killed $sgpr4
; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: ; implicit-def: $sgpr60
+; SI-NEXT: v_writelane_b32 v61, s60, 38
; SI-NEXT: ; kill: killed $sgpr4
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: v_writelane_b32 v61, s60, 38
; SI-NEXT: v_writelane_b32 v61, s61, 39
+; SI-NEXT: ; kill: killed $sgpr4
+; SI-NEXT: ; implicit-def: $sgpr4
; SI-NEXT: ; implicit-def: $sgpr60
+; SI-NEXT: v_writelane_b32 v61, s60, 40
; SI-NEXT: ; kill: killed $sgpr4
; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_writelane_b32 v61, s61, 41
; SI-NEXT: ; implicit-def: $sgpr97
; SI-NEXT: ; implicit-def: $sgpr96
; SI-NEXT: ; implicit-def: $sgpr28
@@ -45397,8 +45399,6 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; SI-NEXT: ; implicit-def: $sgpr27
; SI-NEXT: ; kill: killed $sgpr4
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: v_writelane_b32 v61, s60, 40
-; SI-NEXT: v_writelane_b32 v61, s61, 41
; SI-NEXT: ; implicit-def: $sgpr98
; SI-NEXT: ; implicit-def: $sgpr86
; SI-NEXT: ; implicit-def: $sgpr84
@@ -45570,6 +45570,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s7
+; SI-NEXT: v_mov_b32_e32 v55, s97
; SI-NEXT: v_mov_b32_e32 v54, s96
; SI-NEXT: v_mov_b32_e32 v52, s60
; SI-NEXT: v_mov_b32_e32 v47, s28
@@ -45590,48 +45591,47 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
; SI-NEXT: v_mov_b32_e32 v31, s46
; SI-NEXT: v_mov_b32_e32 v36, s56
; SI-NEXT: v_readlane_b32 s26, v61, 40
+; SI-NEXT: v_readlane_b32 s27, v61, 41
; SI-NEXT: v_readlane_b32 s28, v61, 38
+; SI-NEXT: v_readlane_b32 s29, v61, 39
; SI-NEXT: v_readlane_b32 s6, v61, 36
+; SI-NEXT: v_readlane_b32 s7, v61, 37
; SI-NEXT: v_readlane_b32 s58, v61, 34
+; SI-NEXT: v_readlane_b32 s59, v61, 35
; SI-NEXT: v_readlane_b32 s60, v61, 32
+; SI-NEXT: v_readlane_b32 s61, v61, 33
; SI-NEXT: v_readlane_b32 s8, v61, 30
+; SI-NEXT: v_readlane_b32 s9, v61, 31
; SI-NEXT: v_readlane_b32 s10, v61, 28
+; SI-NEXT: v_readlane_b32 s11, v61, 29
; SI-NEXT: v_readlane_b32 s12, v61, 26
+; SI-NEXT: v_readlane_b32 s13, v61, 27
; SI-NEXT: v_readlane_b32 s14, v61, 24
+; SI-NEXT: v_readlane_b32 s15, v61, 25
; SI-NEXT: v_readlane_b32 s16, v61, 22
+; SI-NEXT: v_readlane_b32 s17, v61, 23
; SI-NEXT: s_mov_b32 s96, s94
; SI-NEXT: v_readlane_b32 s94, v61, 20
-; SI-NEXT: v_readlane_b32 s18, v61, 18
-; SI-NEXT: v_readlane_b32 s20, v61, 16
-; SI-NEXT: v_readlane_b32 s22, v61, 14
-; SI-NEXT: v_readlane_b32 s24, v61, 12
-; SI-NEXT: v_readlane_b32 s40, v61, 10
-; SI-NEXT: v_readlane_b32 s42, v61, 8
-; SI-NEXT: v_readlane_b32 s44, v61, 6
-; SI-NEXT: v_readlane_b32 s46, v61, 4
-; SI-NEXT: v_readlane_b32 s56, v61, 2
-; SI-NEXT: v_readlane_b32 vcc_lo, v61, 0
-; SI-NEXT: v_mov_b32_e32 v55, s97
-; SI-NEXT: v_readlane_b32 s27, v61, 41
-; SI-NEXT: v_readlane_b32 s29, v61, 39
-; SI-NEXT: v_readlane_b32 s7, v61, 37
-; SI-NEXT: v_readlane_b32 s59, v61, 35
-; SI-NEXT: v_readlane_b32 s61, v61, 33
-; SI-NEXT: v_readlane_b32 s9, v61, 31
-; SI-NEXT: v_readlane_b32 s11, v61, 29
-; SI-NEXT: v_readlane_b32 s13, v61, 27
-; SI-NEXT: v_readlane_b32 s15, v61, 25
-; SI-NEXT: v_readlane_b32 s17, v61, 23
; SI-NEXT: v_readlane_b32 s95, v61, 21
+; SI-NEXT: v_readlane_b32 s18, v61, 18
; SI-NEXT: v_readlane_b32 s19, v61, 19
+; SI-NEXT: v_readlane_b32 s20, v61, 16
; SI-NEXT: v_readlane_b32 s21, v61, 17
+; SI-NEXT: v_readlane_b32 s22, v61, 14
; SI-NEXT: v_readlane_b32 s23, v61, 15
+; SI-NEXT: v_readlane_b32 s24, v61, 12
; SI-NEXT: v_readlane_b32 s25, v61, 13
+; SI-NEXT: v_readlane_b32 s40, v61, 10
; SI-NEXT: v_readlane_b32 s41, v61, 11
+; SI-NEXT: v_readlane_b32 s42, v61, 8
; SI-NEXT: v_readlane_b32 s43, v61, 9
+; SI-NEXT: v_readlane_b32 s44, v61, 6
; SI-NEXT: v_readlane_b32 s45, v61, 7
+; SI-NEXT: v_readlane_b32 s46, v61, 4
; SI-NEXT: v_readlane_b32 s47, v61, 5
+; SI-NEXT: v_readlane_b32 s56, v61, 2
; SI-NEXT: v_readlane_b32 s57, v61, 3
+; SI-NEXT: v_readlane_b32 vcc_lo, v61, 0
; SI-NEXT: v_readlane_b32 vcc_hi, v61, 1
; SI-NEXT: .LBB37_5: ; %end
; SI-NEXT: s_waitcnt vmcnt(14)
@@ -82666,8 +82666,8 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
; SI-NEXT: v_mov_b32_e32 v2, s18
; SI-NEXT: v_readlane_b32 s18, v23, 0
; SI-NEXT: s_and_b32 s16, s16, 0xff
-; SI-NEXT: v_readlane_b32 s19, v23, 1
; SI-NEXT: s_lshl_b32 s18, s18, 8
+; SI-NEXT: v_readlane_b32 s19, v23, 1
; SI-NEXT: s_or_b32 s16, s16, s18
; SI-NEXT: v_readlane_b32 s18, v23, 2
; SI-NEXT: v_readlane_b32 s19, v23, 3
@@ -82700,8 +82700,8 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
; SI-NEXT: v_mov_b32_e32 v2, s16
; SI-NEXT: v_readlane_b32 s16, v23, 6
; SI-NEXT: s_and_b32 s14, s14, 0xff
-; SI-NEXT: v_readlane_b32 s17, v23, 7
; SI-NEXT: s_lshl_b32 s16, s16, 8
+; SI-NEXT: v_readlane_b32 s17, v23, 7
; SI-NEXT: s_or_b32 s14, s14, s16
; SI-NEXT: v_readlane_b32 s16, v23, 8
; SI-NEXT: v_readlane_b32 s17, v23, 9
@@ -82734,8 +82734,8 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
; SI-NEXT: v_mov_b32_e32 v2, s14
; SI-NEXT: v_readlane_b32 s14, v23, 12
; SI-NEXT: s_and_b32 s12, s12, 0xff
-; SI-NEXT: v_readlane_b32 s15, v23, 13
; SI-NEXT: s_lshl_b32 s14, s14, 8
+; SI-NEXT: v_readlane_b32 s15, v23, 13
; SI-NEXT: s_or_b32 s12, s12, s14
; SI-NEXT: v_readlane_b32 s14, v23, 14
; SI-NEXT: v_readlane_b32 s15, v23, 15
@@ -82768,8 +82768,8 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
; SI-NEXT: v_mov_b32_e32 v2, s12
; SI-NEXT: v_readlane_b32 s12, v23, 18
; SI-NEXT: s_and_b32 s10, s10, 0xff
-; SI-NEXT: v_readlane_b32 s13, v23, 19
; SI-NEXT: s_lshl_b32 s12, s12, 8
+; SI-NEXT: v_readlane_b32 s13, v23, 19
; SI-NEXT: s_or_b32 s10, s10, s12
; SI-NEXT: v_readlane_b32 s12, v23, 20
; SI-NEXT: v_readlane_b32 s13, v23, 21
@@ -82802,8 +82802,8 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
; SI-NEXT: v_mov_b32_e32 v2, s10
; SI-NEXT: v_readlane_b32 s10, v23, 24
; SI-NEXT: s_and_b32 s8, s8, 0xff
-; SI-NEXT: v_readlane_b32 s11, v23, 25
; SI-NEXT: s_lshl_b32 s10, s10, 8
+; SI-NEXT: v_readlane_b32 s11, v23, 25
; SI-NEXT: s_or_b32 s8, s8, s10
; SI-NEXT: v_readlane_b32 s10, v23, 26
; SI-NEXT: v_readlane_b32 s11, v23, 27
@@ -82836,8 +82836,8 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
; SI-NEXT: v_mov_b32_e32 v2, s8
; SI-NEXT: v_readlane_b32 s8, v23, 30
; SI-NEXT: s_and_b32 s6, s6, 0xff
-; SI-NEXT: v_readlane_b32 s9, v23, 31
; SI-NEXT: s_lshl_b32 s8, s8, 8
+; SI-NEXT: v_readlane_b32 s9, v23, 31
; SI-NEXT: s_or_b32 s6, s6, s8
; SI-NEXT: v_readlane_b32 s8, v23, 32
; SI-NEXT: v_readlane_b32 s9, v23, 33
@@ -82955,8 +82955,6 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
; SI-NEXT: ; implicit-def: $sgpr26
; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: ; implicit-def: $sgpr54
-; SI-NEXT: ; implicit-def: $sgpr26
-; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_writelane_b32 v23, s54, 0
; SI-NEXT: ; implicit-def: $sgpr26
@@ -82965,172 +82963,174 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
; SI-NEXT: ; implicit-def: $sgpr26
; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: ; implicit-def: $sgpr54
-; SI-NEXT: ; implicit-def: $sgpr53
-; SI-NEXT: ; implicit-def: $sgpr52
-; SI-NEXT: ; implicit-def: $sgpr51
-; SI-NEXT: ; implicit-def: $sgpr50
-; SI-NEXT: ; implicit-def: $sgpr48
-; SI-NEXT: ; implicit-def: $sgpr49
-; SI-NEXT: ; implicit-def: $sgpr38
-; SI-NEXT: ; implicit-def: $sgpr36
-; SI-NEXT: ; implicit-def: $sgpr34
-; SI-NEXT: ; implicit-def: $sgpr30
-; SI-NEXT: ; implicit-def: $sgpr94
-; SI-NEXT: ; implicit-def: $sgpr92
-; SI-NEXT: ; implicit-def: $sgpr90
-; SI-NEXT: ; implicit-def: $sgpr88
-; SI-NEXT: ; implicit-def: $sgpr78
-; SI-NEXT: ; implicit-def: $sgpr76
-; SI-NEXT: ; implicit-def: $sgpr74
-; SI-NEXT: ; implicit-def: $sgpr72
-; SI-NEXT: ; implicit-def: $sgpr62
-; SI-NEXT: ; implicit-def: $sgpr60
-; SI-NEXT: ; implicit-def: $sgpr58
-; SI-NEXT: ; implicit-def: $sgpr28
-; SI-NEXT: ; implicit-def: $sgpr98
-; SI-NEXT: ; implicit-def: $sgpr96
-; SI-NEXT: ; implicit-def: $sgpr86
-; SI-NEXT: ; implicit-def: $sgpr84
-; SI-NEXT: ; implicit-def: $sgpr82
-; SI-NEXT: ; implicit-def: $sgpr80
-; SI-NEXT: ; implicit-def: $sgpr70
-; SI-NEXT: ; implicit-def: $sgpr68
-; SI-NEXT: ; implicit-def: $sgpr66
-; SI-NEXT: ; implicit-def: $sgpr64
-; SI-NEXT: ; implicit-def: $sgpr26
-; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: v_writelane_b32 v23, s54, 2
; SI-NEXT: ; implicit-def: $sgpr26
; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: v_writelane_b32 v23, s55, 3
; SI-NEXT: ; implicit-def: $sgpr26
-; SI-NEXT: ; implicit-def: $sgpr54
; SI-NEXT: ; kill: killed $sgpr26
-; SI-NEXT: ; implicit-def: $sgpr26
+; SI-NEXT: ; implicit-def: $sgpr54
; SI-NEXT: v_writelane_b32 v23, s54, 4
-; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: ; implicit-def: $sgpr26
-; SI-NEXT: v_writelane_b32 v23, s55, 5
-; SI-NEXT: ; implicit-def: $sgpr54
; SI-NEXT: ; kill: killed $sgpr26
+; SI-NEXT: v_writelane_b32 v23, s55, 5
; SI-NEXT: ; implicit-def: $sgpr26
; SI-NEXT: ; kill: killed $sgpr26
-; SI-NEXT: ; implicit-def: $sgpr26
-; SI-NEXT: v_writelane_b32 v23, s54, 6
-; SI-NEXT: v_writelane_b32 v23, s55, 7
; SI-NEXT: ; implicit-def: $sgpr54
-; SI-NEXT: ; kill: killed $sgpr26
+; SI-NEXT: v_writelane_b32 v23, s54, 6
; SI-NEXT: ; implicit-def: $sgpr26
; SI-NEXT: ; kill: killed $sgpr26
+; SI-NEXT: v_writelane_b32 v23, s55, 7
; SI-NEXT: ; implicit-def: $sgpr26
-; SI-NEXT: v_writelane_b32 v23, s54, 8
-; SI-NEXT: v_writelane_b32 v23, s55, 9
-; SI-NEXT: ; implicit-def: $sgpr54
; SI-NEXT: ; kill: killed $sgpr26
+; SI-NEXT: ; implicit-def: $sgpr54
+; SI-NEXT: v_writelane_b32 v23, s54, 8
; SI-NEXT: ; implicit-def: $sgpr26
; SI-NEXT: ; kill: killed $sgpr26
+; SI-NEXT: v_writelane_b32 v23, s55, 9
; SI-NEXT: ; implicit-def: $sgpr26
-; SI-NEXT: v_writelane_b32 v23, s54, 10
-; SI-NEXT: v_writelane_b32 v23, s55, 11
-; SI-NEXT: ; implicit-def: $sgpr54
; SI-NEXT: ; kill: killed $sgpr26
+; SI-NEXT: ; implicit-def: $sgpr54
+; SI-NEXT: v_writelane_b32 v23, s54, 10
; SI-NEXT: ; implicit-def: $sgpr26
; SI-NEXT: ; kill: killed $sgpr26
+; SI-NEXT: v_writelane_b32 v23, s55, 11
; SI-NEXT: ; implicit-def: $sgpr26
-; SI-NEXT: v_writelane_b32 v23, s54, 12
-; SI-NEXT: v_writelane_b32 v23, s55, 13
-; SI-NEXT: ; implicit-def: $sgpr54
; SI-NEXT: ; kill: killed $sgpr26
+; SI-NEXT: ; implicit-def: $sgpr54
+; SI-NEXT: v_writelane_b32 v23, s54, 12
; SI-NEXT: ; implicit-def: $sgpr26
; SI-NEXT: ; kill: killed $sgpr26
+; SI-NEXT: v_writelane_b32 v23, s55, 13
; SI-NEXT: ; implicit-def: $sgpr26
-; SI-NEXT: v_writelane_b32 v23, s54, 14
-; SI-NEXT: v_writelane_b32 v23, s55, 15
-; SI-NEXT: ; implicit-def: $sgpr54
; SI-NEXT: ; kill: killed $sgpr26
+; SI-NEXT: ; implicit-def: $sgpr54
+; SI-NEXT: v_writelane_b32 v23, s54, 14
; SI-NEXT: ; implicit-def: $sgpr26
; SI-NEXT: ; kill: killed $sgpr26
+; SI-NEXT: v_writelane_b32 v23, s55, 15
; SI-NEXT: ; implicit-def: $sgpr26
-; SI-NEXT: v_writelane_b32 v23, s54, 16
-; SI-NEXT: v_writelane_b32 v23, s55, 17
-; SI-NEXT: ; implicit-def: $sgpr54
; SI-NEXT: ; kill: killed $sgpr26
+; SI-NEXT: ; implicit-def: $sgpr54
+; SI-NEXT: v_writelane_b32 v23, s54, 16
; SI-NEXT: ; implicit-def: $sgpr26
; SI-NEXT: ; kill: killed $sgpr26
+; SI-NEXT: v_writelane_b32 v23, s55, 17
; SI-NEXT: ; implicit-def: $sgpr26
-; SI-NEXT: v_writelane_b32 v23, s54, 18
-; SI-NEXT: v_writelane_b32 v23, s55, 19
-; SI-NEXT: ; implicit-def: $sgpr54
; SI-NEXT: ; kill: killed $sgpr26
+; SI-NEXT: ; implicit-def: $sgpr54
+; SI-NEXT: v_writelane_b32 v23, s54, 18
; SI-NEXT: ; implicit-def: $sgpr26
; SI-NEXT: ; kill: killed $sgpr26
+; SI-NEXT: v_writelane_b32 v23, s55, 19
; SI-NEXT: ; implicit-def: $sgpr26
-; SI-NEXT: v_writelane_b32 v23, s54, 20
-; SI-NEXT: v_writelane_b32 v23, s55, 21
-; SI-NEXT: ; implicit-def: $sgpr54
; SI-NEXT: ; kill: killed $sgpr26
+; SI-NEXT: ; implicit-def: $sgpr54
+; SI-NEXT: v_writelane_b32 v23, s54, 20
; SI-NEXT: ; implicit-def: $sgpr26
; SI-NEXT: ; kill: killed $sgpr26
+; SI-NEXT: v_writelane_b32 v23, s55, 21
; SI-NEXT: ; implicit-def: $sgpr26
-; SI-NEXT: v_writelane_b32 v23, s54, 22
-; SI-NEXT: v_writelane_b32 v23, s55, 23
-; SI-NEXT: ; implicit-def: $sgpr54
; SI-NEXT: ; kill: killed $sgpr26
+; SI-NEXT: ; implicit-def: $sgpr54
+; SI-NEXT: v_writelane_b32 v23, s54, 22
; SI-NEXT: ; implicit-def: $sgpr26
; SI-NEXT: ; kill: killed $sgpr26
+; SI-NEXT: v_writelane_b32 v23, s55, 23
; SI-NEXT: ; implicit-def: $sgpr26
-; SI-NEXT: v_writelane_b32 v23, s54, 24
-; SI-NEXT: v_writelane_b32 v23, s55, 25
-; SI-NEXT: ; implicit-def: $sgpr54
; SI-NEXT: ; kill: killed $sgpr26
+; SI-NEXT: ; implicit-def: $sgpr54
+; SI-NEXT: v_writelane_b32 v23, s54, 24
; SI-NEXT: ; implicit-def: $sgpr26
; SI-NEXT: ; kill: killed $sgpr26
+; SI-NEXT: v_writelane_b32 v23, s55, 25
; SI-NEXT: ; implicit-def: $sgpr26
-; SI-NEXT: v_writelane_b32 v23, s54, 26
-; SI-NEXT: v_writelane_b32 v23, s55, 27
-; SI-NEXT: ; implicit-def: $sgpr54
; SI-NEXT: ; kill: killed $sgpr26
+; SI-NEXT: ; implicit-def: $sgpr54
+; SI-NEXT: v_writelane_b32 v23, s54, 26
; SI-NEXT: ; implicit-def: $sgpr26
; SI-NEXT: ; kill: killed $sgpr26
+; SI-NEXT: v_writelane_b32 v23, s55, 27
; SI-NEXT: ; implicit-def: $sgpr26
-; SI-NEXT: v_writelane_b32 v23, s54, 28
-; SI-NEXT: v_writelane_b32 v23, s55, 29
-; SI-NEXT: ; implicit-def: $sgpr54
; SI-NEXT: ; kill: killed $sgpr26
+; SI-NEXT: ; implicit-def: $sgpr54
+; SI-NEXT: v_writelane_b32 v23, s54, 28
; SI-NEXT: ; implicit-def: $sgpr26
; SI-NEXT: ; kill: killed $sgpr26
+; SI-NEXT: v_writelane_b32 v23, s55, 29
; SI-NEXT: ; implicit-def: $sgpr26
-; SI-NEXT: v_writelane_b32 v23, s54, 30
-; SI-NEXT: v_writelane_b32 v23, s55, 31
-; SI-NEXT: ; implicit-def: $sgpr54
; SI-NEXT: ; kill: killed $sgpr26
+; SI-NEXT: ; implicit-def: $sgpr54
+; SI-NEXT: v_writelane_b32 v23, s54, 30
; SI-NEXT: ; implicit-def: $sgpr26
; SI-NEXT: ; kill: killed $sgpr26
+; SI-NEXT: v_writelane_b32 v23, s55, 31
; SI-NEXT: ; implicit-def: $sgpr26
-; SI-NEXT: v_writelane_b32 v23, s54, 32
-; SI-NEXT: v_writelane_b32 v23, s55, 33
-; SI-NEXT: ; implicit-def: $sgpr54
; SI-NEXT: ; kill: killed $sgpr26
+; SI-NEXT: ; implicit-def: $sgpr54
+; SI-NEXT: v_writelane_b32 v23, s54, 32
; SI-NEXT: ; implicit-def: $sgpr26
; SI-NEXT: ; kill: killed $sgpr26
+; SI-NEXT: v_writelane_b32 v23, s55, 33
; SI-NEXT: ; implicit-def: $sgpr26
-; SI-NEXT: v_writelane_b32 v23, s54, 34
-; SI-NEXT: v_writelane_b32 v23, s55, 35
-; SI-NEXT: ; implicit-def: $sgpr54
; SI-NEXT: ; kill: killed $sgpr26
+; SI-NEXT: ; implicit-def: $sgpr54
+; SI-NEXT: v_writelane_b32 v23, s54, 34
; SI-NEXT: ; implicit-def: $sgpr26
; SI-NEXT: ; kill: killed $sgpr26
+; SI-NEXT: v_writelane_b32 v23, s55, 35
; SI-NEXT: ; implicit-def: $sgpr26
-; SI-NEXT: v_writelane_b32 v23, s54, 36
-; SI-NEXT: v_writelane_b32 v23, s55, 37
+; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: ; implicit-def: $sgpr54
+; SI-NEXT: v_writelane_b32 v23, s54, 36
+; SI-NEXT: ; implicit-def: $sgpr26
; SI-NEXT: ; kill: killed $sgpr26
+; SI-NEXT: v_writelane_b32 v23, s55, 37
; SI-NEXT: ; implicit-def: $sgpr26
; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: ; implicit-def: $sgpr26
+; SI-NEXT: ; kill: killed $sgpr26
+; SI-NEXT: ; implicit-def: $sgpr54
; SI-NEXT: v_writelane_b32 v23, s54, 38
+; SI-NEXT: ; implicit-def: $sgpr26
+; SI-NEXT: ; kill: killed $sgpr26
+; SI-NEXT: ; implicit-def: $sgpr53
+; SI-NEXT: ; implicit-def: $sgpr52
+; SI-NEXT: ; implicit-def: $sgpr51
+; SI-NEXT: ; implicit-def: $sgpr50
+; SI-NEXT: ; implicit-def: $sgpr48
+; SI-NEXT: ; implicit-def: $sgpr49
+; SI-NEXT: ; implicit-def: $sgpr38
+; SI-NEXT: ; implicit-def: $sgpr36
+; SI-NEXT: ; implicit-def: $sgpr34
+; SI-NEXT: ; implicit-def: $sgpr30
+; SI-NEXT: ; implicit-def: $sgpr94
+; SI-NEXT: ; implicit-def: $sgpr92
+; SI-NEXT: ; implicit-def: $sgpr90
+; SI-NEXT: ; implicit-def: $sgpr88
+; SI-NEXT: ; implicit-def: $sgpr78
+; SI-NEXT: ; implicit-def: $sgpr76
+; SI-NEXT: ; implicit-def: $sgpr74
+; SI-NEXT: ; implicit-def: $sgpr72
+; SI-NEXT: ; implicit-def: $sgpr62
+; SI-NEXT: ; implicit-def: $sgpr60
+; SI-NEXT: ; implicit-def: $sgpr58
+; SI-NEXT: ; implicit-def: $sgpr28
+; SI-NEXT: ; implicit-def: $sgpr98
+; SI-NEXT: ; implicit-def: $sgpr96
+; SI-NEXT: ; implicit-def: $sgpr86
+; SI-NEXT: ; implicit-def: $sgpr84
+; SI-NEXT: ; implicit-def: $sgpr82
+; SI-NEXT: ; implicit-def: $sgpr80
+; SI-NEXT: ; implicit-def: $sgpr70
+; SI-NEXT: ; implicit-def: $sgpr68
+; SI-NEXT: ; implicit-def: $sgpr66
+; SI-NEXT: ; implicit-def: $sgpr64
; SI-NEXT: v_writelane_b32 v23, s55, 39
+; SI-NEXT: ; implicit-def: $sgpr26
+; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: ; implicit-def: $sgpr54
+; SI-NEXT: ; implicit-def: $sgpr26
; SI-NEXT: s_branch .LBB57_2
;
; VI-LABEL: bitcast_v16i64_to_v128i8_scalar:
@@ -85081,10 +85081,9 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
; GFX9-NEXT: .LBB57_4:
; GFX9-NEXT: ; implicit-def: $sgpr27
; GFX9-NEXT: ; kill: killed $sgpr27
-; GFX9-NEXT: ; implicit-def: $sgpr83
-; GFX9-NEXT: ; implicit-def: $sgpr82
; GFX9-NEXT: ; implicit-def: $sgpr27
; GFX9-NEXT: ; kill: killed $sgpr27
+; GFX9-NEXT: ; implicit-def: $sgpr82
; GFX9-NEXT: v_writelane_b32 v22, s82, 0
; GFX9-NEXT: ; implicit-def: $sgpr27
; GFX9-NEXT: ; kill: killed $sgpr27
@@ -85117,6 +85116,7 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
; GFX9-NEXT: ; implicit-def: $sgpr85
; GFX9-NEXT: ; implicit-def: $sgpr84
; GFX9-NEXT: ; implicit-def: $sgpr81
+; GFX9-NEXT: ; implicit-def: $sgpr83
; GFX9-NEXT: ; implicit-def: $sgpr36
; GFX9-NEXT: ; implicit-def: $sgpr34
; GFX9-NEXT: ; implicit-def: $sgpr30
@@ -85469,12 +85469,10 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
; GFX11-NEXT: s_lshr_b64 s[62:63], s[20:21], 24
; GFX11-NEXT: s_branch .LBB57_3
; GFX11-NEXT: .LBB57_2:
-; GFX11-NEXT: ; implicit-def: $vcc_hi
; GFX11-NEXT: ; implicit-def: $vcc_lo
-; GFX11-NEXT: ; implicit-def: $sgpr42
-; GFX11-NEXT: ; kill: killed $sgpr42
-; GFX11-NEXT: s_mov_b32 s101, -1
; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 0
+; GFX11-NEXT: ; implicit-def: $vcc_hi
+; GFX11-NEXT: ; implicit-def: $vcc_lo
; GFX11-NEXT: ; implicit-def: $sgpr42
; GFX11-NEXT: ; kill: killed $sgpr42
; GFX11-NEXT: ; implicit-def: $sgpr42
@@ -85485,7 +85483,6 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
; GFX11-NEXT: ; kill: killed $sgpr42
; GFX11-NEXT: ; implicit-def: $sgpr42
; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 1
-; GFX11-NEXT: ; implicit-def: $vcc_lo
; GFX11-NEXT: ; kill: killed $sgpr42
; GFX11-NEXT: ; implicit-def: $sgpr42
; GFX11-NEXT: ; kill: killed $sgpr42
@@ -85497,6 +85494,7 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
; GFX11-NEXT: ; kill: killed $sgpr42
; GFX11-NEXT: ; implicit-def: $sgpr42
; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 2
+; GFX11-NEXT: ; implicit-def: $vcc_lo
; GFX11-NEXT: ; kill: killed $sgpr42
; GFX11-NEXT: ; implicit-def: $sgpr42
; GFX11-NEXT: ; kill: killed $sgpr42
@@ -85508,7 +85506,6 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
; GFX11-NEXT: ; kill: killed $sgpr42
; GFX11-NEXT: ; implicit-def: $sgpr42
; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 3
-; GFX11-NEXT: ; implicit-def: $vcc_lo
; GFX11-NEXT: ; kill: killed $sgpr42
; GFX11-NEXT: ; implicit-def: $sgpr42
; GFX11-NEXT: ; kill: killed $sgpr42
@@ -85520,6 +85517,7 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
; GFX11-NEXT: ; kill: killed $sgpr42
; GFX11-NEXT: ; implicit-def: $sgpr42
; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 4
+; GFX11-NEXT: ; implicit-def: $vcc_lo
; GFX11-NEXT: ; kill: killed $sgpr42
; GFX11-NEXT: ; implicit-def: $sgpr42
; GFX11-NEXT: ; kill: killed $sgpr42
@@ -85528,10 +85526,12 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
; GFX11-NEXT: ; implicit-def: $sgpr42
; GFX11-NEXT: ; kill: killed $sgpr42
; GFX11-NEXT: ; implicit-def: $sgpr42
+; GFX11-NEXT: s_mov_b32 s101, -1
+; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 5
+; GFX11-NEXT: ; kill: killed $sgpr42
+; GFX11-NEXT: ; implicit-def: $sgpr42
; GFX11-NEXT: ; kill: killed $sgpr42
; GFX11-NEXT: ; implicit-def: $sgpr42
-; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 5
-; GFX11-NEXT: ; implicit-def: $vcc_lo
; GFX11-NEXT: ; implicit-def: $sgpr45
; GFX11-NEXT: ; implicit-def: $sgpr44
; GFX11-NEXT: ; implicit-def: $sgpr30
@@ -85595,17 +85595,17 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
; GFX11-NEXT: ; implicit-def: $sgpr88
; GFX11-NEXT: ; implicit-def: $sgpr76
; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 6
-; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 7
; GFX11-NEXT: ; implicit-def: $vcc_lo
+; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 7
; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 8
-; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 9
; GFX11-NEXT: ; implicit-def: $vcc_lo
+; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 9
; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 10
-; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 11
; GFX11-NEXT: ; implicit-def: $vcc_lo
+; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 11
; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 12
-; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 13
; GFX11-NEXT: ; implicit-def: $vcc_lo
+; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 13
; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 14
; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 15
; GFX11-NEXT: .LBB57_3: ; %Flow
@@ -117783,8 +117783,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; SI-NEXT: ; implicit-def: $sgpr26
; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: ; implicit-def: $sgpr48
-; SI-NEXT: ; implicit-def: $sgpr26
-; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: v_writelane_b32 v62, s48, 0
; SI-NEXT: ; implicit-def: $sgpr26
; SI-NEXT: ; kill: killed $sgpr26
@@ -117792,37 +117790,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; SI-NEXT: ; implicit-def: $sgpr26
; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: ; implicit-def: $sgpr48
-; SI-NEXT: ; implicit-def: $sgpr38
-; SI-NEXT: ; implicit-def: $sgpr36
-; SI-NEXT: ; implicit-def: $sgpr34
-; SI-NEXT: ; implicit-def: $sgpr30
-; SI-NEXT: ; implicit-def: $sgpr94
-; SI-NEXT: ; implicit-def: $sgpr92
-; SI-NEXT: ; implicit-def: $sgpr90
-; SI-NEXT: ; implicit-def: $sgpr88
-; SI-NEXT: ; implicit-def: $sgpr78
-; SI-NEXT: ; implicit-def: $sgpr76
-; SI-NEXT: ; implicit-def: $sgpr74
-; SI-NEXT: ; implicit-def: $sgpr72
-; SI-NEXT: ; implicit-def: $sgpr62
-; SI-NEXT: ; implicit-def: $sgpr60
-; SI-NEXT: ; implicit-def: $sgpr58
-; SI-NEXT: ; implicit-def: $sgpr28
-; SI-NEXT: ; implicit-def: $sgpr98
-; SI-NEXT: ; implicit-def: $sgpr96
-; SI-NEXT: ; implicit-def: $sgpr86
-; SI-NEXT: ; implicit-def: $sgpr84
-; SI-NEXT: ; implicit-def: $sgpr82
-; SI-NEXT: ; implicit-def: $sgpr80
-; SI-NEXT: ; implicit-def: $sgpr70
-; SI-NEXT: ; implicit-def: $sgpr68
-; SI-NEXT: ; implicit-def: $sgpr66
-; SI-NEXT: ; implicit-def: $sgpr64
-; SI-NEXT: ; implicit-def: $sgpr54
-; SI-NEXT: ; implicit-def: $sgpr52
-; SI-NEXT: ; implicit-def: $sgpr50
-; SI-NEXT: ; implicit-def: $sgpr26
-; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: v_writelane_b32 v62, s48, 2
; SI-NEXT: ; implicit-def: $sgpr26
; SI-NEXT: ; kill: killed $sgpr26
@@ -117830,8 +117797,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; SI-NEXT: ; implicit-def: $sgpr26
; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: ; implicit-def: $sgpr48
-; SI-NEXT: ; implicit-def: $sgpr26
-; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: v_writelane_b32 v62, s48, 4
; SI-NEXT: ; implicit-def: $sgpr26
; SI-NEXT: ; kill: killed $sgpr26
@@ -117839,8 +117804,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; SI-NEXT: ; implicit-def: $sgpr26
; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: ; implicit-def: $sgpr48
-; SI-NEXT: ; implicit-def: $sgpr26
-; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: v_writelane_b32 v62, s48, 6
; SI-NEXT: ; implicit-def: $sgpr26
; SI-NEXT: ; kill: killed $sgpr26
@@ -117848,8 +117811,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; SI-NEXT: ; implicit-def: $sgpr26
; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: ; implicit-def: $sgpr48
-; SI-NEXT: ; implicit-def: $sgpr26
-; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: v_writelane_b32 v62, s48, 8
; SI-NEXT: ; implicit-def: $sgpr26
; SI-NEXT: ; kill: killed $sgpr26
@@ -117857,8 +117818,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; SI-NEXT: ; implicit-def: $sgpr26
; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: ; implicit-def: $sgpr48
-; SI-NEXT: ; implicit-def: $sgpr26
-; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: v_writelane_b32 v62, s48, 10
; SI-NEXT: ; implicit-def: $sgpr26
; SI-NEXT: ; kill: killed $sgpr26
@@ -117866,8 +117825,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; SI-NEXT: ; implicit-def: $sgpr26
; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: ; implicit-def: $sgpr48
-; SI-NEXT: ; implicit-def: $sgpr26
-; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: v_writelane_b32 v62, s48, 12
; SI-NEXT: ; implicit-def: $sgpr26
; SI-NEXT: ; kill: killed $sgpr26
@@ -117875,8 +117832,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; SI-NEXT: ; implicit-def: $sgpr26
; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: ; implicit-def: $sgpr48
-; SI-NEXT: ; implicit-def: $sgpr26
-; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: v_writelane_b32 v62, s48, 14
; SI-NEXT: ; implicit-def: $sgpr26
; SI-NEXT: ; kill: killed $sgpr26
@@ -117884,8 +117839,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; SI-NEXT: ; implicit-def: $sgpr26
; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: ; implicit-def: $sgpr48
-; SI-NEXT: ; implicit-def: $sgpr26
-; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: v_writelane_b32 v62, s48, 16
; SI-NEXT: ; implicit-def: $sgpr26
; SI-NEXT: ; kill: killed $sgpr26
@@ -117893,8 +117846,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; SI-NEXT: ; implicit-def: $sgpr26
; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: ; implicit-def: $sgpr48
-; SI-NEXT: ; implicit-def: $sgpr26
-; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: v_writelane_b32 v62, s48, 18
; SI-NEXT: ; implicit-def: $sgpr26
; SI-NEXT: ; kill: killed $sgpr26
@@ -117902,8 +117853,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; SI-NEXT: ; implicit-def: $sgpr26
; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: ; implicit-def: $sgpr48
-; SI-NEXT: ; implicit-def: $sgpr26
-; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: v_writelane_b32 v62, s48, 20
; SI-NEXT: ; implicit-def: $sgpr26
; SI-NEXT: ; kill: killed $sgpr26
@@ -117911,8 +117860,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; SI-NEXT: ; implicit-def: $sgpr26
; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: ; implicit-def: $sgpr48
-; SI-NEXT: ; implicit-def: $sgpr26
-; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: v_writelane_b32 v62, s48, 22
; SI-NEXT: ; implicit-def: $sgpr26
; SI-NEXT: ; kill: killed $sgpr26
@@ -117920,44 +117867,97 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; SI-NEXT: ; implicit-def: $sgpr26
; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: ; implicit-def: $sgpr48
-; SI-NEXT: ; implicit-def: $sgpr26
-; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: v_writelane_b32 v62, s48, 24
; SI-NEXT: ; implicit-def: $sgpr26
; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: v_writelane_b32 v62, s49, 25
; SI-NEXT: ; implicit-def: $sgpr26
-; SI-NEXT: ; implicit-def: $sgpr48
; SI-NEXT: ; kill: killed $sgpr26
-; SI-NEXT: ; implicit-def: $sgpr26
+; SI-NEXT: ; implicit-def: $sgpr48
; SI-NEXT: v_writelane_b32 v62, s48, 26
-; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: ; implicit-def: $sgpr26
+; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: v_writelane_b32 v62, s49, 27
+; SI-NEXT: ; implicit-def: $sgpr26
+; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: ; implicit-def: $sgpr48
+; SI-NEXT: v_writelane_b32 v62, s48, 28
+; SI-NEXT: ; implicit-def: $sgpr26
; SI-NEXT: ; kill: killed $sgpr26
+; SI-NEXT: v_writelane_b32 v62, s49, 29
; SI-NEXT: ; implicit-def: $sgpr26
; SI-NEXT: ; kill: killed $sgpr26
+; SI-NEXT: ; implicit-def: $sgpr48
+; SI-NEXT: v_writelane_b32 v62, s48, 30
; SI-NEXT: ; implicit-def: $sgpr26
-; SI-NEXT: v_writelane_b32 v62, s48, 28
-; SI-NEXT: v_writelane_b32 v62, s49, 29
+; SI-NEXT: ; kill: killed $sgpr26
+; SI-NEXT: v_writelane_b32 v62, s49, 31
+; SI-NEXT: ; implicit-def: $sgpr26
+; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: ; implicit-def: $sgpr48
+; SI-NEXT: v_writelane_b32 v62, s48, 32
+; SI-NEXT: ; implicit-def: $sgpr26
; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: ; implicit-def: $sgpr26
; SI-NEXT: ; kill: killed $sgpr26
+; SI-NEXT: v_writelane_b32 v62, s49, 33
; SI-NEXT: ; implicit-def: $sgpr26
-; SI-NEXT: v_writelane_b32 v62, s48, 30
-; SI-NEXT: v_writelane_b32 v62, s49, 31
+; SI-NEXT: ; kill: killed $sgpr26
+; SI-NEXT: ; implicit-def: $sgpr38
+; SI-NEXT: ; implicit-def: $sgpr36
+; SI-NEXT: ; implicit-def: $sgpr34
+; SI-NEXT: ; implicit-def: $sgpr30
+; SI-NEXT: ; implicit-def: $sgpr94
+; SI-NEXT: ; implicit-def: $sgpr92
+; SI-NEXT: ; implicit-def: $sgpr90
+; SI-NEXT: ; implicit-def: $sgpr88
+; SI-NEXT: ; implicit-def: $sgpr78
+; SI-NEXT: ; implicit-def: $sgpr76
+; SI-NEXT: ; implicit-def: $sgpr74
+; SI-NEXT: ; implicit-def: $sgpr72
+; SI-NEXT: ; implicit-def: $sgpr62
+; SI-NEXT: ; implicit-def: $sgpr60
+; SI-NEXT: ; implicit-def: $sgpr58
+; SI-NEXT: ; implicit-def: $sgpr28
+; SI-NEXT: ; implicit-def: $sgpr98
+; SI-NEXT: ; implicit-def: $sgpr96
+; SI-NEXT: ; implicit-def: $sgpr86
+; SI-NEXT: ; implicit-def: $sgpr84
+; SI-NEXT: ; implicit-def: $sgpr82
+; SI-NEXT: ; implicit-def: $sgpr80
+; SI-NEXT: ; implicit-def: $sgpr70
+; SI-NEXT: ; implicit-def: $sgpr68
+; SI-NEXT: ; implicit-def: $sgpr66
+; SI-NEXT: ; implicit-def: $sgpr64
+; SI-NEXT: ; implicit-def: $sgpr54
+; SI-NEXT: ; implicit-def: $sgpr52
+; SI-NEXT: ; implicit-def: $sgpr50
; SI-NEXT: ; implicit-def: $sgpr48
+; SI-NEXT: ; implicit-def: $sgpr26
+; SI-NEXT: ; kill: killed $sgpr26
+; SI-NEXT: ; implicit-def: $sgpr26
+; SI-NEXT: ; kill: killed $sgpr26
+; SI-NEXT: ; implicit-def: $sgpr26
+; SI-NEXT: ; kill: killed $sgpr26
+; SI-NEXT: ; implicit-def: $sgpr26
+; SI-NEXT: ; kill: killed $sgpr26
+; SI-NEXT: ; implicit-def: $sgpr26
+; SI-NEXT: ; kill: killed $sgpr26
+; SI-NEXT: ; implicit-def: $sgpr26
+; SI-NEXT: ; kill: killed $sgpr26
+; SI-NEXT: ; implicit-def: $sgpr26
+; SI-NEXT: ; kill: killed $sgpr26
+; SI-NEXT: ; implicit-def: $sgpr26
+; SI-NEXT: ; kill: killed $sgpr26
+; SI-NEXT: ; implicit-def: $sgpr26
+; SI-NEXT: ; kill: killed $sgpr26
+; SI-NEXT: ; implicit-def: $sgpr26
; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: ; implicit-def: $sgpr26
; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: ; implicit-def: $sgpr26
-; SI-NEXT: v_writelane_b32 v62, s48, 32
-; SI-NEXT: v_writelane_b32 v62, s49, 33
; SI-NEXT: ; kill: killed $sgpr26
; SI-NEXT: ; implicit-def: $sgpr26
-; SI-NEXT: ; implicit-def: $sgpr48
; SI-NEXT: s_branch .LBB73_2
; SI-NEXT: .LBB73_4:
; SI-NEXT: v_mov_b32_e32 v1, s38
@@ -118013,9 +118013,9 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
; SI-NEXT: v_mov_b32_e32 v16, s26
; SI-NEXT: v_readlane_b32 s26, v62, 30
; SI-NEXT: v_readlane_b32 s27, v62, 31
+; SI-NEXT: v_mov_b32_e32 v3, s8
; SI-NEXT: v_mov_b32_e32 v51, s26
; SI-NEXT: v_readlane_b32 s26, v62, 32
-; SI-NEXT: v_mov_b32_e32 v3, s8
; SI-NEXT: v_readlane_b32 s27, v62, 33
; SI-NEXT: v_mov_b32_e32 v38, s72
; SI-NEXT: v_mov_b32_e32 v49, s62
@@ -167832,8 +167832,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; SI-NEXT: s_lshr_b32 s13, s4, 16
; SI-NEXT: s_mov_b32 s5, s13
; SI-NEXT: v_writelane_b32 v61, s4, 26
-; SI-NEXT: v_writelane_b32 v61, s5, 27
; SI-NEXT: v_readfirstlane_b32 s4, v46
+; SI-NEXT: v_writelane_b32 v61, s5, 27
; SI-NEXT: s_lshr_b32 s5, s4, 16
; SI-NEXT: v_readfirstlane_b32 s4, v10
; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], 16
@@ -168027,191 +168027,92 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; SI-NEXT: s_branch .LBB91_3
; SI-NEXT: .LBB91_2:
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: s_waitcnt expcnt(3)
-; SI-NEXT: v_mov_b32_e32 v54, v59
; SI-NEXT: v_writelane_b32 v62, s4, 0
; SI-NEXT: v_writelane_b32 v62, s5, 1
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: v_mov_b32_e32 v59, v51
; SI-NEXT: v_writelane_b32 v62, s4, 2
; SI-NEXT: v_writelane_b32 v62, s5, 3
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: v_mov_b32_e32 v31, v46
; SI-NEXT: v_writelane_b32 v62, s4, 4
; SI-NEXT: v_writelane_b32 v62, s5, 5
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: v_mov_b32_e32 v34, v22
; SI-NEXT: v_writelane_b32 v62, s4, 6
; SI-NEXT: v_writelane_b32 v62, s5, 7
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: v_mov_b32_e32 v22, v24
; SI-NEXT: v_writelane_b32 v62, s4, 8
; SI-NEXT: v_writelane_b32 v62, s5, 9
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: v_mov_b32_e32 v7, v37
; SI-NEXT: v_writelane_b32 v62, s4, 10
; SI-NEXT: v_writelane_b32 v62, s5, 11
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
; SI-NEXT: v_writelane_b32 v62, s4, 12
; SI-NEXT: v_writelane_b32 v62, s5, 13
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; SI-NEXT: v_writelane_b32 v62, s4, 14
; SI-NEXT: v_writelane_b32 v62, s5, 15
+; SI-NEXT: s_waitcnt expcnt(3)
+; SI-NEXT: v_mov_b32_e32 v54, v59
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: ; implicit-def: $sgpr21
-; SI-NEXT: ; implicit-def: $sgpr20
-; SI-NEXT: ; implicit-def: $sgpr89
-; SI-NEXT: ; implicit-def: $sgpr88
-; SI-NEXT: v_mov_b32_e32 v44, v1
; SI-NEXT: v_writelane_b32 v62, s4, 16
+; SI-NEXT: v_mov_b32_e32 v59, v51
+; SI-NEXT: v_mov_b32_e32 v31, v46
+; SI-NEXT: v_mov_b32_e32 v34, v22
+; SI-NEXT: v_mov_b32_e32 v22, v24
+; SI-NEXT: v_mov_b32_e32 v7, v37
+; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; SI-NEXT: v_writelane_b32 v62, s5, 17
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: v_mov_b32_e32 v52, v17
; SI-NEXT: v_writelane_b32 v62, s4, 18
; SI-NEXT: v_writelane_b32 v62, s5, 19
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: v_mov_b32_e32 v43, v20
; SI-NEXT: v_writelane_b32 v62, s4, 20
; SI-NEXT: v_writelane_b32 v62, s5, 21
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: v_mov_b32_e32 v42, v32
; SI-NEXT: v_writelane_b32 v62, s4, 22
; SI-NEXT: v_writelane_b32 v62, s5, 23
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: v_mov_b32_e32 v41, v5
; SI-NEXT: v_writelane_b32 v62, s4, 24
; SI-NEXT: v_writelane_b32 v62, s5, 25
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: s_mov_b64 vcc, -1
; SI-NEXT: v_writelane_b32 v62, s4, 26
; SI-NEXT: v_writelane_b32 v62, s5, 27
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: v_mov_b32_e32 v60, v35
; SI-NEXT: v_writelane_b32 v62, s4, 28
; SI-NEXT: v_writelane_b32 v62, s5, 29
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: v_mov_b32_e32 v35, v6
; SI-NEXT: v_writelane_b32 v62, s4, 30
; SI-NEXT: v_writelane_b32 v62, s5, 31
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: v_mov_b32_e32 v32, v4
; SI-NEXT: v_writelane_b32 v62, s4, 32
; SI-NEXT: v_writelane_b32 v62, s5, 33
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: v_mov_b32_e32 v30, v12
; SI-NEXT: v_writelane_b32 v62, s4, 34
; SI-NEXT: v_writelane_b32 v62, s5, 35
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: v_mov_b32_e32 v19, v39
; SI-NEXT: v_writelane_b32 v62, s4, 36
; SI-NEXT: v_writelane_b32 v62, s5, 37
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: v_mov_b32_e32 v39, v25
; SI-NEXT: v_writelane_b32 v62, s4, 38
; SI-NEXT: v_writelane_b32 v62, s5, 39
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: v_mov_b32_e32 v12, v29
; SI-NEXT: v_writelane_b32 v62, s4, 40
; SI-NEXT: v_writelane_b32 v62, s5, 41
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: v_mov_b32_e32 v20, v2
; SI-NEXT: v_writelane_b32 v62, s4, 42
; SI-NEXT: v_writelane_b32 v62, s5, 43
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: v_mov_b32_e32 v6, v55
; SI-NEXT: v_writelane_b32 v62, s4, 44
; SI-NEXT: v_writelane_b32 v62, s5, 45
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: v_mov_b32_e32 v17, v8
; SI-NEXT: v_writelane_b32 v62, s4, 46
; SI-NEXT: v_writelane_b32 v62, s5, 47
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: v_mov_b32_e32 v29, v33
; SI-NEXT: v_writelane_b32 v62, s4, 48
; SI-NEXT: v_writelane_b32 v62, s5, 49
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: ; implicit-def: $sgpr8
-; SI-NEXT: ; implicit-def: $sgpr96
-; SI-NEXT: ; implicit-def: $sgpr78
-; SI-NEXT: ; implicit-def: $sgpr7
-; SI-NEXT: ; implicit-def: $vgpr48
-; SI-NEXT: ; implicit-def: $sgpr86
-; SI-NEXT: ; implicit-def: $sgpr84
-; SI-NEXT: ; implicit-def: $sgpr61
-; SI-NEXT: ; implicit-def: $sgpr65
-; SI-NEXT: ; implicit-def: $vgpr16
-; SI-NEXT: ; implicit-def: $sgpr80
-; SI-NEXT: ; implicit-def: $sgpr70
-; SI-NEXT: ; implicit-def: $sgpr72
-; SI-NEXT: ; implicit-def: $sgpr69
-; SI-NEXT: ; implicit-def: $vgpr15
-; SI-NEXT: ; implicit-def: $sgpr66
-; SI-NEXT: ; implicit-def: $sgpr38
-; SI-NEXT: ; implicit-def: $sgpr75
-; SI-NEXT: ; implicit-def: $sgpr91
-; SI-NEXT: ; implicit-def: $vgpr13
-; SI-NEXT: ; implicit-def: $sgpr52
-; SI-NEXT: ; implicit-def: $sgpr98
-; SI-NEXT: ; implicit-def: $sgpr58
-; SI-NEXT: ; implicit-def: $sgpr37
-; SI-NEXT: ; implicit-def: $vgpr11
-; SI-NEXT: ; implicit-def: $sgpr30
-; SI-NEXT: ; implicit-def: $sgpr82
-; SI-NEXT: ; implicit-def: $sgpr43
-; SI-NEXT: ; implicit-def: $vgpr47
-; SI-NEXT: ; implicit-def: $sgpr44
-; SI-NEXT: ; implicit-def: $vgpr3
-; SI-NEXT: ; implicit-def: $sgpr27
-; SI-NEXT: ; implicit-def: $vgpr18
-; SI-NEXT: ; implicit-def: $sgpr28
-; SI-NEXT: ; implicit-def: $sgpr74
-; SI-NEXT: ; implicit-def: $vgpr9
-; SI-NEXT: ; implicit-def: $sgpr22
-; SI-NEXT: ; implicit-def: $vgpr14
-; SI-NEXT: ; implicit-def: $sgpr18
-; SI-NEXT: ; implicit-def: $vgpr56
-; SI-NEXT: ; implicit-def: $sgpr15
-; SI-NEXT: ; implicit-def: $sgpr12
-; SI-NEXT: ; implicit-def: $sgpr6
-; SI-NEXT: ; implicit-def: $sgpr50
-; SI-NEXT: ; implicit-def: $sgpr94
-; SI-NEXT: ; implicit-def: $sgpr57
-; SI-NEXT: ; implicit-def: $sgpr92
-; SI-NEXT: ; implicit-def: $sgpr64
-; SI-NEXT: ; implicit-def: $sgpr79
-; SI-NEXT: ; implicit-def: $sgpr76
-; SI-NEXT: ; implicit-def: $sgpr90
-; SI-NEXT: ; implicit-def: $sgpr73
-; SI-NEXT: ; implicit-def: $sgpr62
-; SI-NEXT: ; implicit-def: $sgpr54
-; SI-NEXT: ; implicit-def: $sgpr59
-; SI-NEXT: ; implicit-def: $sgpr46
-; SI-NEXT: ; implicit-def: $sgpr68
-; SI-NEXT: ; implicit-def: $sgpr45
-; SI-NEXT: ; implicit-def: $sgpr40
-; SI-NEXT: ; implicit-def: $sgpr56
-; SI-NEXT: ; implicit-def: $sgpr29
-; SI-NEXT: ; implicit-def: $sgpr24
-; SI-NEXT: ; implicit-def: $sgpr34
-; SI-NEXT: ; implicit-def: $sgpr23
-; SI-NEXT: ; implicit-def: $sgpr16
-; SI-NEXT: ; implicit-def: $sgpr36
-; SI-NEXT: ; implicit-def: $sgpr19
-; SI-NEXT: ; implicit-def: $sgpr10
-; SI-NEXT: ; implicit-def: $sgpr14
-; SI-NEXT: ; implicit-def: $sgpr13
-; SI-NEXT: ; implicit-def: $vgpr4
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr1
-; SI-NEXT: ; kill: killed $vgpr1
-; SI-NEXT: ; implicit-def: $vgpr55
; SI-NEXT: v_writelane_b32 v62, s4, 50
; SI-NEXT: v_writelane_b32 v62, s5, 51
; SI-NEXT: ; implicit-def: $sgpr4
@@ -168231,7 +168132,6 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; SI-NEXT: v_writelane_b32 v62, s5, 61
; SI-NEXT: ; implicit-def: $sgpr4
; SI-NEXT: v_writelane_b32 v62, s4, 62
-; SI-NEXT: v_writelane_b32 v62, s5, 63
; SI-NEXT: ; implicit-def: $sgpr4
; SI-NEXT: v_writelane_b32 v61, s4, 0
; SI-NEXT: v_writelane_b32 v61, s5, 1
@@ -168271,18 +168171,118 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; SI-NEXT: ; implicit-def: $sgpr4
; SI-NEXT: v_writelane_b32 v61, s4, 24
; SI-NEXT: v_writelane_b32 v61, s5, 25
-; SI-NEXT: ; implicit-def: $sgpr5
; SI-NEXT: v_writelane_b32 v61, s4, 26
+; SI-NEXT: v_writelane_b32 v62, s5, 63
+; SI-NEXT: ; implicit-def: $sgpr5
; SI-NEXT: v_writelane_b32 v61, s5, 27
+; SI-NEXT: ; implicit-def: $sgpr20
; SI-NEXT: v_writelane_b32 v61, s20, 28
+; SI-NEXT: ; implicit-def: $sgpr21
; SI-NEXT: v_writelane_b32 v61, s21, 29
; SI-NEXT: ; implicit-def: $sgpr20
-; SI-NEXT: ; implicit-def: $sgpr4
; SI-NEXT: v_writelane_b32 v61, s20, 30
; SI-NEXT: v_writelane_b32 v61, s21, 31
+; SI-NEXT: v_mov_b32_e32 v44, v1
+; SI-NEXT: ; implicit-def: $sgpr88
; SI-NEXT: v_writelane_b32 v61, s88, 32
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: v_mov_b32_e32 v52, v17
+; SI-NEXT: v_mov_b32_e32 v43, v20
+; SI-NEXT: v_mov_b32_e32 v42, v32
+; SI-NEXT: v_mov_b32_e32 v41, v5
+; SI-NEXT: s_mov_b64 vcc, -1
+; SI-NEXT: ; implicit-def: $sgpr89
; SI-NEXT: v_writelane_b32 v61, s89, 33
+; SI-NEXT: v_mov_b32_e32 v60, v35
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: v_mov_b32_e32 v35, v6
+; SI-NEXT: v_mov_b32_e32 v32, v4
+; SI-NEXT: v_mov_b32_e32 v30, v12
+; SI-NEXT: v_mov_b32_e32 v19, v39
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: v_mov_b32_e32 v39, v25
+; SI-NEXT: v_mov_b32_e32 v12, v29
+; SI-NEXT: v_mov_b32_e32 v20, v2
+; SI-NEXT: v_mov_b32_e32 v6, v55
+; SI-NEXT: v_mov_b32_e32 v17, v8
+; SI-NEXT: v_mov_b32_e32 v29, v33
+; SI-NEXT: ; implicit-def: $sgpr8
+; SI-NEXT: ; implicit-def: $sgpr96
+; SI-NEXT: ; implicit-def: $sgpr78
+; SI-NEXT: ; implicit-def: $sgpr7
+; SI-NEXT: ; implicit-def: $vgpr48
+; SI-NEXT: ; implicit-def: $sgpr86
+; SI-NEXT: ; implicit-def: $sgpr84
+; SI-NEXT: ; implicit-def: $sgpr61
+; SI-NEXT: ; implicit-def: $sgpr65
+; SI-NEXT: ; implicit-def: $vgpr16
+; SI-NEXT: ; implicit-def: $sgpr80
+; SI-NEXT: ; implicit-def: $sgpr70
+; SI-NEXT: ; implicit-def: $sgpr72
+; SI-NEXT: ; implicit-def: $sgpr69
+; SI-NEXT: ; implicit-def: $vgpr15
+; SI-NEXT: ; implicit-def: $sgpr66
+; SI-NEXT: ; implicit-def: $sgpr38
+; SI-NEXT: ; implicit-def: $sgpr75
+; SI-NEXT: ; implicit-def: $sgpr91
+; SI-NEXT: ; implicit-def: $vgpr13
+; SI-NEXT: ; implicit-def: $sgpr52
+; SI-NEXT: ; implicit-def: $sgpr98
+; SI-NEXT: ; implicit-def: $sgpr58
+; SI-NEXT: ; implicit-def: $sgpr37
+; SI-NEXT: ; implicit-def: $vgpr11
+; SI-NEXT: ; implicit-def: $sgpr30
+; SI-NEXT: ; implicit-def: $sgpr82
+; SI-NEXT: ; implicit-def: $sgpr43
+; SI-NEXT: ; implicit-def: $vgpr47
+; SI-NEXT: ; implicit-def: $sgpr44
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; implicit-def: $sgpr27
+; SI-NEXT: ; implicit-def: $vgpr18
+; SI-NEXT: ; implicit-def: $sgpr28
+; SI-NEXT: ; implicit-def: $sgpr74
+; SI-NEXT: ; implicit-def: $vgpr9
+; SI-NEXT: ; implicit-def: $sgpr22
+; SI-NEXT: ; implicit-def: $vgpr14
+; SI-NEXT: ; implicit-def: $sgpr18
+; SI-NEXT: ; implicit-def: $vgpr56
+; SI-NEXT: ; implicit-def: $sgpr15
+; SI-NEXT: ; implicit-def: $sgpr12
+; SI-NEXT: ; implicit-def: $sgpr6
+; SI-NEXT: ; implicit-def: $sgpr50
+; SI-NEXT: ; implicit-def: $sgpr94
+; SI-NEXT: ; implicit-def: $sgpr57
+; SI-NEXT: ; implicit-def: $sgpr92
+; SI-NEXT: ; implicit-def: $sgpr64
+; SI-NEXT: ; implicit-def: $sgpr79
+; SI-NEXT: ; implicit-def: $sgpr76
+; SI-NEXT: ; implicit-def: $sgpr90
+; SI-NEXT: ; implicit-def: $sgpr73
+; SI-NEXT: ; implicit-def: $sgpr62
+; SI-NEXT: ; implicit-def: $sgpr54
+; SI-NEXT: ; implicit-def: $sgpr59
+; SI-NEXT: ; implicit-def: $sgpr46
+; SI-NEXT: ; implicit-def: $sgpr68
+; SI-NEXT: ; implicit-def: $sgpr45
+; SI-NEXT: ; implicit-def: $sgpr40
+; SI-NEXT: ; implicit-def: $sgpr56
+; SI-NEXT: ; implicit-def: $sgpr29
+; SI-NEXT: ; implicit-def: $sgpr24
+; SI-NEXT: ; implicit-def: $sgpr34
+; SI-NEXT: ; implicit-def: $sgpr23
+; SI-NEXT: ; implicit-def: $sgpr16
+; SI-NEXT: ; implicit-def: $sgpr36
+; SI-NEXT: ; implicit-def: $sgpr19
+; SI-NEXT: ; implicit-def: $sgpr10
+; SI-NEXT: ; implicit-def: $sgpr14
+; SI-NEXT: ; implicit-def: $sgpr4
; SI-NEXT: ; implicit-def: $sgpr88
+; SI-NEXT: ; implicit-def: $sgpr13
+; SI-NEXT: ; implicit-def: $vgpr4
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; kill: killed $vgpr1
+; SI-NEXT: ; implicit-def: $vgpr55
; SI-NEXT: .LBB91_3: ; %Flow
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
@@ -168307,30 +168307,29 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v53
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
-; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v38
; SI-NEXT: v_readfirstlane_b32 s6, v9
; SI-NEXT: v_readfirstlane_b32 s8, v3
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v40
-; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v38
; SI-NEXT: s_lshr_b32 s9, s6, 16
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT: v_readfirstlane_b32 s6, v8
+; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
; SI-NEXT: s_lshr_b64 s[10:11], s[8:9], 16
-; SI-NEXT: s_lshr_b32 s9, s6, 16
+; SI-NEXT: v_readfirstlane_b32 s6, v8
; SI-NEXT: v_readfirstlane_b32 s8, v3
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v54
-; SI-NEXT: s_mov_b32 s7, s9
+; SI-NEXT: s_lshr_b32 s9, s6, 16
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v58
-; SI-NEXT: v_writelane_b32 v61, s6, 26
; SI-NEXT: s_lshr_b64 s[20:21], s[8:9], 16
; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
; SI-NEXT: v_readfirstlane_b32 s8, v3
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v49
-; SI-NEXT: v_writelane_b32 v61, s7, 27
+; SI-NEXT: v_writelane_b32 v61, s6, 26
; SI-NEXT: v_readfirstlane_b32 s6, v5
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v50
+; SI-NEXT: s_mov_b32 s7, s9
; SI-NEXT: s_lshr_b32 s9, s6, 16
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
; SI-NEXT: v_readfirstlane_b32 s18, v3
@@ -168351,6 +168350,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v35
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT: v_writelane_b32 v61, s7, 27
; SI-NEXT: s_lshr_b64 s[26:27], s[18:19], 16
; SI-NEXT: s_mov_b32 s17, s26
; SI-NEXT: s_mov_b32 s11, s20
@@ -168801,8 +168801,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; SI-NEXT: .LBB91_5: ; %end
; SI-NEXT: s_and_b32 s5, s8, 0xff
; SI-NEXT: v_readlane_b32 s8, v62, 0
-; SI-NEXT: v_readlane_b32 s9, v62, 1
; SI-NEXT: s_lshl_b32 s8, s8, 8
+; SI-NEXT: v_readlane_b32 s9, v62, 1
; SI-NEXT: s_or_b32 s5, s5, s8
; SI-NEXT: v_readlane_b32 s8, v62, 2
; SI-NEXT: v_readlane_b32 s9, v62, 3
@@ -168829,10 +168829,10 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; SI-NEXT: v_readlane_b32 vcc_hi, v62, 5
; SI-NEXT: v_or_b32_e32 v1, s5, v1
; SI-NEXT: s_and_b32 s5, s86, 0xff
-; SI-NEXT: v_readlane_b32 s9, v62, 7
; SI-NEXT: s_lshl_b32 s8, s8, 8
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0
+; SI-NEXT: v_readlane_b32 s9, v62, 7
; SI-NEXT: s_or_b32 s5, s5, s8
; SI-NEXT: v_readlane_b32 s8, v62, 8
; SI-NEXT: v_readlane_b32 s9, v62, 9
@@ -168860,8 +168860,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; SI-NEXT: v_readlane_b32 s8, v62, 12
; SI-NEXT: v_or_b32_e32 v1, s5, v1
; SI-NEXT: s_and_b32 s5, s80, 0xff
-; SI-NEXT: v_readlane_b32 s9, v62, 13
; SI-NEXT: s_lshl_b32 s8, s8, 8
+; SI-NEXT: v_readlane_b32 s9, v62, 13
; SI-NEXT: s_or_b32 s5, s5, s8
; SI-NEXT: v_readlane_b32 s8, v62, 14
; SI-NEXT: v_readlane_b32 s9, v62, 15
@@ -168890,9 +168890,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; SI-NEXT: v_readlane_b32 s8, v62, 18
; SI-NEXT: v_or_b32_e32 v1, s5, v1
; SI-NEXT: s_and_b32 s5, s66, 0xff
-; SI-NEXT: v_readlane_b32 s9, v62, 19
; SI-NEXT: s_lshl_b32 s8, s8, 8
-; SI-NEXT: v_readlane_b32 s61, v62, 17
+; SI-NEXT: v_readlane_b32 s9, v62, 19
; SI-NEXT: s_or_b32 s5, s5, s8
; SI-NEXT: v_readlane_b32 s8, v62, 20
; SI-NEXT: v_readlane_b32 s9, v62, 21
@@ -168921,9 +168920,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; SI-NEXT: v_readlane_b32 s8, v62, 24
; SI-NEXT: v_or_b32_e32 v1, s5, v1
; SI-NEXT: s_and_b32 s5, s52, 0xff
-; SI-NEXT: v_readlane_b32 s9, v62, 25
; SI-NEXT: s_lshl_b32 s8, s8, 8
-; SI-NEXT: v_readlane_b32 s61, v62, 23
+; SI-NEXT: v_readlane_b32 s9, v62, 25
; SI-NEXT: s_or_b32 s5, s5, s8
; SI-NEXT: v_readlane_b32 s8, v62, 26
; SI-NEXT: v_readlane_b32 s9, v62, 27
@@ -168952,9 +168950,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; SI-NEXT: v_readlane_b32 s8, v62, 30
; SI-NEXT: v_or_b32_e32 v1, s5, v1
; SI-NEXT: s_and_b32 s5, s30, 0xff
-; SI-NEXT: v_readlane_b32 s9, v62, 31
; SI-NEXT: s_lshl_b32 s8, s8, 8
-; SI-NEXT: v_readlane_b32 s61, v62, 29
+; SI-NEXT: v_readlane_b32 s9, v62, 31
; SI-NEXT: s_or_b32 s5, s5, s8
; SI-NEXT: v_readlane_b32 s8, v62, 32
; SI-NEXT: v_readlane_b32 s9, v62, 33
@@ -168983,8 +168980,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; SI-NEXT: v_readlane_b32 s8, v62, 36
; SI-NEXT: v_or_b32_e32 v1, s5, v1
; SI-NEXT: s_and_b32 s5, s50, 0xff
-; SI-NEXT: v_readlane_b32 s9, v62, 37
; SI-NEXT: s_lshl_b32 s8, s8, 8
+; SI-NEXT: v_readlane_b32 s9, v62, 37
; SI-NEXT: s_or_b32 s5, s5, s8
; SI-NEXT: v_readlane_b32 s8, v62, 38
; SI-NEXT: v_readlane_b32 s9, v62, 39
@@ -169013,9 +169010,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; SI-NEXT: v_readlane_b32 s8, v62, 42
; SI-NEXT: v_or_b32_e32 v1, s5, v1
; SI-NEXT: s_and_b32 s5, s92, 0xff
-; SI-NEXT: v_readlane_b32 s9, v62, 43
; SI-NEXT: s_lshl_b32 s8, s8, 8
-; SI-NEXT: v_readlane_b32 s43, v62, 41
+; SI-NEXT: v_readlane_b32 s9, v62, 43
; SI-NEXT: s_or_b32 s5, s5, s8
; SI-NEXT: v_readlane_b32 s8, v62, 44
; SI-NEXT: v_readlane_b32 s9, v62, 45
@@ -169044,8 +169040,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; SI-NEXT: v_readlane_b32 s8, v62, 48
; SI-NEXT: v_or_b32_e32 v1, s5, v1
; SI-NEXT: s_and_b32 s5, s76, 0xff
-; SI-NEXT: v_readlane_b32 s9, v62, 49
; SI-NEXT: s_lshl_b32 s8, s8, 8
+; SI-NEXT: v_readlane_b32 s9, v62, 49
; SI-NEXT: s_or_b32 s5, s5, s8
; SI-NEXT: v_readlane_b32 s8, v62, 50
; SI-NEXT: v_readlane_b32 s9, v62, 51
@@ -169074,9 +169070,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; SI-NEXT: v_readlane_b32 s8, v62, 54
; SI-NEXT: v_or_b32_e32 v1, s5, v1
; SI-NEXT: s_and_b32 s5, s62, 0xff
-; SI-NEXT: v_readlane_b32 s9, v62, 55
; SI-NEXT: s_lshl_b32 s8, s8, 8
-; SI-NEXT: v_readlane_b32 s27, v62, 53
+; SI-NEXT: v_readlane_b32 s9, v62, 55
; SI-NEXT: s_or_b32 s5, s5, s8
; SI-NEXT: v_readlane_b32 s8, v62, 56
; SI-NEXT: v_readlane_b32 s9, v62, 57
@@ -169105,9 +169100,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; SI-NEXT: v_readlane_b32 s8, v62, 60
; SI-NEXT: v_or_b32_e32 v1, s5, v1
; SI-NEXT: s_and_b32 s5, s46, 0xff
-; SI-NEXT: v_readlane_b32 s9, v62, 61
; SI-NEXT: s_lshl_b32 s8, s8, 8
-; SI-NEXT: v_readlane_b32 s27, v62, 59
+; SI-NEXT: v_readlane_b32 s9, v62, 61
; SI-NEXT: s_or_b32 s5, s5, s8
; SI-NEXT: v_readlane_b32 s8, v62, 62
; SI-NEXT: v_readlane_b32 s9, v62, 63
@@ -169136,9 +169130,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; SI-NEXT: v_readlane_b32 s8, v61, 2
; SI-NEXT: v_or_b32_e32 v1, s5, v1
; SI-NEXT: s_and_b32 s5, s40, 0xff
-; SI-NEXT: v_readlane_b32 s9, v61, 3
; SI-NEXT: s_lshl_b32 s8, s8, 8
-; SI-NEXT: v_readlane_b32 s27, v61, 1
+; SI-NEXT: v_readlane_b32 s9, v61, 3
; SI-NEXT: s_or_b32 s5, s5, s8
; SI-NEXT: v_readlane_b32 s8, v61, 4
; SI-NEXT: v_readlane_b32 s9, v61, 5
@@ -169167,8 +169160,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; SI-NEXT: v_readlane_b32 s8, v61, 8
; SI-NEXT: v_or_b32_e32 v1, s5, v1
; SI-NEXT: s_and_b32 s5, s24, 0xff
-; SI-NEXT: v_readlane_b32 s9, v61, 9
; SI-NEXT: s_lshl_b32 s8, s8, 8
+; SI-NEXT: v_readlane_b32 s9, v61, 9
; SI-NEXT: s_or_b32 s5, s5, s8
; SI-NEXT: v_readlane_b32 s8, v61, 10
; SI-NEXT: v_readlane_b32 s9, v61, 11
@@ -169197,8 +169190,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; SI-NEXT: v_readlane_b32 s8, v61, 14
; SI-NEXT: v_or_b32_e32 v1, s5, v1
; SI-NEXT: s_and_b32 s5, s16, 0xff
-; SI-NEXT: v_readlane_b32 s9, v61, 15
; SI-NEXT: s_lshl_b32 s8, s8, 8
+; SI-NEXT: v_readlane_b32 s9, v61, 15
; SI-NEXT: s_or_b32 s5, s5, s8
; SI-NEXT: v_readlane_b32 s8, v61, 16
; SI-NEXT: v_readlane_b32 s9, v61, 17
@@ -169223,14 +169216,24 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; SI-NEXT: s_and_b32 s8, s19, 0xff
; SI-NEXT: s_lshl_b32 s8, s8, 16
; SI-NEXT: s_and_b32 s5, s5, 0xffff
+; SI-NEXT: v_readlane_b32 s9, v61, 21
+; SI-NEXT: v_readlane_b32 s9, v61, 23
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0
; SI-NEXT: s_and_b32 s4, s4, 0xff
+; SI-NEXT: v_readlane_b32 s61, v62, 17
+; SI-NEXT: v_readlane_b32 s27, v62, 53
+; SI-NEXT: v_readlane_b32 s61, v62, 23
+; SI-NEXT: v_readlane_b32 s27, v62, 59
+; SI-NEXT: v_readlane_b32 s61, v62, 29
+; SI-NEXT: v_readlane_b32 s43, v62, 41
+; SI-NEXT: v_readlane_b32 s27, v61, 1
; SI-NEXT: v_readlane_b32 s61, v62, 35
; SI-NEXT: v_readlane_b32 s43, v62, 47
; SI-NEXT: v_readlane_b32 s27, v61, 7
; SI-NEXT: v_readlane_b32 s21, v61, 13
; SI-NEXT: v_readlane_b32 s17, v61, 19
+; SI-NEXT: v_readlane_b32 s11, v61, 25
; SI-NEXT: v_readlane_b32 s99, v63, 35
; SI-NEXT: v_readlane_b32 s98, v63, 34
; SI-NEXT: v_readlane_b32 s97, v63, 33
@@ -169273,11 +169276,9 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; SI-NEXT: v_readlane_b32 s8, v61, 20
; SI-NEXT: v_or_b32_e32 v1, s5, v1
; SI-NEXT: s_and_b32 s5, s10, 0xff
-; SI-NEXT: v_readlane_b32 s9, v61, 21
; SI-NEXT: s_lshl_b32 s8, s8, 8
; SI-NEXT: s_or_b32 s5, s5, s8
; SI-NEXT: v_readlane_b32 s8, v61, 22
-; SI-NEXT: v_readlane_b32 s9, v61, 23
; SI-NEXT: s_and_b32 s8, s8, 0xff
; SI-NEXT: v_readlane_b32 s10, v61, 24
; SI-NEXT: s_lshl_b32 s8, s8, 16
@@ -169301,16 +169302,16 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; SI-NEXT: s_and_b32 s5, s5, 0xffff
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0
-; SI-NEXT: v_readlane_b32 s11, v61, 25
+; SI-NEXT: v_readlane_b32 s9, v61, 29
+; SI-NEXT: v_readlane_b32 s9, v61, 31
+; SI-NEXT: v_readlane_b32 s9, v61, 33
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1
; SI-NEXT: v_or_b32_e32 v1, s8, v1
; SI-NEXT: v_readlane_b32 s8, v61, 28
-; SI-NEXT: v_readlane_b32 s9, v61, 29
; SI-NEXT: v_or_b32_e32 v1, s5, v1
; SI-NEXT: s_lshl_b32 s5, s8, 8
; SI-NEXT: v_readlane_b32 s8, v61, 30
-; SI-NEXT: v_readlane_b32 s9, v61, 31
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: s_and_b32 s5, s8, 0xff
; SI-NEXT: v_readlane_b32 s8, v61, 32
@@ -169332,7 +169333,6 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; SI-NEXT: s_lshl_b32 s5, s5, 16
; SI-NEXT: s_and_b32 s4, s4, 0xffff
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
-; SI-NEXT: v_readlane_b32 s9, v61, 33
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1
; SI-NEXT: v_or_b32_e32 v1, s5, v1
@@ -170923,7 +170923,6 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; VI-NEXT: v_mov_b32_e32 v2, s5
; VI-NEXT: s_and_b32 s5, s6, 0xff
; VI-NEXT: v_readlane_b32 s6, v22, 49
-; VI-NEXT: v_readlane_b32 s9, v22, 5
; VI-NEXT: s_lshl_b32 s6, s6, 8
; VI-NEXT: s_or_b32 s5, s5, s6
; VI-NEXT: v_readlane_b32 s6, v22, 48
@@ -170980,6 +170979,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; VI-NEXT: s_lshl_b32 s5, s5, 16
; VI-NEXT: v_add_u32_e32 v1, vcc, 0x78, v0
; VI-NEXT: s_or_b32 s4, s4, s5
+; VI-NEXT: v_readlane_b32 s9, v22, 5
; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0
; VI-NEXT: v_mov_b32_e32 v1, s4
@@ -171030,42 +171030,41 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; VI-NEXT: .LBB91_4:
; VI-NEXT: ; implicit-def: $sgpr46
; VI-NEXT: ; kill: killed $sgpr46
-; VI-NEXT: ; implicit-def: $sgpr61
; VI-NEXT: ; implicit-def: $sgpr60
-; VI-NEXT: ; implicit-def: $sgpr63
-; VI-NEXT: ; implicit-def: $sgpr62
-; VI-NEXT: ; implicit-def: $sgpr73
-; VI-NEXT: ; implicit-def: $sgpr72
-; VI-NEXT: ; implicit-def: $sgpr75
-; VI-NEXT: ; implicit-def: $sgpr74
-; VI-NEXT: ; implicit-def: $sgpr77
-; VI-NEXT: ; implicit-def: $sgpr76
-; VI-NEXT: ; implicit-def: $sgpr46
-; VI-NEXT: ; kill: killed $sgpr46
; VI-NEXT: v_writelane_b32 v22, s60, 0
; VI-NEXT: ; implicit-def: $sgpr46
; VI-NEXT: ; kill: killed $sgpr46
+; VI-NEXT: ; implicit-def: $sgpr61
; VI-NEXT: v_writelane_b32 v22, s61, 1
; VI-NEXT: ; implicit-def: $sgpr46
; VI-NEXT: ; kill: killed $sgpr46
+; VI-NEXT: ; implicit-def: $sgpr62
; VI-NEXT: v_writelane_b32 v22, s62, 2
; VI-NEXT: ; implicit-def: $sgpr46
; VI-NEXT: ; kill: killed $sgpr46
+; VI-NEXT: ; implicit-def: $sgpr63
; VI-NEXT: v_writelane_b32 v22, s63, 3
; VI-NEXT: ; implicit-def: $sgpr46
; VI-NEXT: ; kill: killed $sgpr46
+; VI-NEXT: ; implicit-def: $sgpr72
; VI-NEXT: v_writelane_b32 v22, s72, 4
; VI-NEXT: ; implicit-def: $sgpr46
; VI-NEXT: ; kill: killed $sgpr46
+; VI-NEXT: ; implicit-def: $sgpr73
; VI-NEXT: v_writelane_b32 v22, s73, 5
; VI-NEXT: ; implicit-def: $sgpr46
; VI-NEXT: ; kill: killed $sgpr46
+; VI-NEXT: ; implicit-def: $sgpr74
; VI-NEXT: v_writelane_b32 v22, s74, 6
; VI-NEXT: ; implicit-def: $sgpr46
; VI-NEXT: ; kill: killed $sgpr46
+; VI-NEXT: ; implicit-def: $sgpr75
; VI-NEXT: v_writelane_b32 v22, s75, 7
; VI-NEXT: ; implicit-def: $sgpr46
; VI-NEXT: ; kill: killed $sgpr46
+; VI-NEXT: ; implicit-def: $sgpr46
+; VI-NEXT: ; kill: killed $sgpr46
+; VI-NEXT: ; implicit-def: $sgpr76
; VI-NEXT: v_writelane_b32 v22, s76, 8
; VI-NEXT: ; implicit-def: $sgpr46
; VI-NEXT: ; kill: killed $sgpr46
@@ -171079,6 +171078,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; VI-NEXT: ; implicit-def: $sgpr47
; VI-NEXT: ; implicit-def: $sgpr65
; VI-NEXT: ; implicit-def: $sgpr53
+; VI-NEXT: ; implicit-def: $sgpr77
; VI-NEXT: ; implicit-def: $sgpr59
; VI-NEXT: ; implicit-def: $sgpr66
; VI-NEXT: ; implicit-def: $sgpr78
@@ -173106,11 +173106,11 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX11-NEXT: .LBB91_2:
; GFX11-NEXT: ; implicit-def: $sgpr42
; GFX11-NEXT: ; kill: killed $sgpr42
-; GFX11-NEXT: ; implicit-def: $sgpr43
; GFX11-NEXT: ; implicit-def: $sgpr46
; GFX11-NEXT: ; kill: killed $sgpr46
-; GFX11-NEXT: ; implicit-def: $vcc_hi
+; GFX11-NEXT: ; implicit-def: $sgpr43
; GFX11-NEXT: ; implicit-def: $vcc_lo
+; GFX11-NEXT: ; implicit-def: $vcc_hi
; GFX11-NEXT: s_mov_b32 s104, -1
; GFX11-NEXT: ; implicit-def: $sgpr42
; GFX11-NEXT: ; kill: killed $sgpr42
@@ -173213,7 +173213,6 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX11-NEXT: ; kill: killed $sgpr42
; GFX11-NEXT: ; implicit-def: $sgpr42
; GFX11-NEXT: v_writelane_b32 v20, s42, 0
-; GFX11-NEXT: v_writelane_b32 v20, s43, 1
; GFX11-NEXT: ; implicit-def: $sgpr42
; GFX11-NEXT: ; kill: killed $sgpr42
; GFX11-NEXT: ; implicit-def: $sgpr42
@@ -173223,14 +173222,15 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX11-NEXT: ; implicit-def: $sgpr42
; GFX11-NEXT: ; kill: killed $sgpr42
; GFX11-NEXT: ; implicit-def: $sgpr42
-; GFX11-NEXT: v_writelane_b32 v20, s46, 2
+; GFX11-NEXT: v_writelane_b32 v20, s43, 1
; GFX11-NEXT: ; kill: killed $sgpr42
; GFX11-NEXT: ; implicit-def: $sgpr42
-; GFX11-NEXT: v_writelane_b32 v20, s47, 3
+; GFX11-NEXT: v_writelane_b32 v20, s46, 2
; GFX11-NEXT: ; implicit-def: $sgpr46
+; GFX11-NEXT: v_writelane_b32 v20, s47, 3
; GFX11-NEXT: v_writelane_b32 v20, vcc_lo, 4
-; GFX11-NEXT: v_writelane_b32 v20, vcc_hi, 5
; GFX11-NEXT: ; implicit-def: $vcc_lo
+; GFX11-NEXT: v_writelane_b32 v20, vcc_hi, 5
; GFX11-NEXT: v_writelane_b32 v20, vcc_lo, 6
; GFX11-NEXT: v_writelane_b32 v20, vcc_hi, 7
; GFX11-NEXT: .LBB91_3: ; %Flow
@@ -174070,82 +174070,82 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX11-NEXT: s_lshr_b64 s[74:75], s[74:75], 24
; GFX11-NEXT: s_lshr_b32 s75, s42, 8
; GFX11-NEXT: v_writelane_b32 v20, s58, 0
+; GFX11-NEXT: s_lshr_b32 s58, s63, 24
; GFX11-NEXT: s_lshr_b32 s26, s26, 16
; GFX11-NEXT: s_lshr_b32 s65, s73, 24
; GFX11-NEXT: s_pack_ll_b32_b16 s90, s26, s90
-; GFX11-NEXT: s_lshr_b32 s82, s73, 8
; GFX11-NEXT: v_writelane_b32 v20, s59, 1
-; GFX11-NEXT: s_lshr_b32 s58, s63, 24
; GFX11-NEXT: s_lshr_b32 s59, s63, 8
; GFX11-NEXT: s_lshr_b64 s[62:63], s[62:63], 24
; GFX11-NEXT: s_lshr_b32 s63, s93, 24
-; GFX11-NEXT: s_lshr_b32 s84, s72, 16
+; GFX11-NEXT: s_lshr_b32 s82, s73, 8
; GFX11-NEXT: v_writelane_b32 v20, s63, 21
; GFX11-NEXT: s_lshr_b32 s63, s93, 8
+; GFX11-NEXT: s_lshr_b32 s84, s72, 16
; GFX11-NEXT: s_lshr_b32 s51, s72, 8
; GFX11-NEXT: s_lshr_b64 s[72:73], s[72:73], 24
-; GFX11-NEXT: s_lshr_b32 s86, s77, 24
; GFX11-NEXT: v_writelane_b32 v20, s63, 22
; GFX11-NEXT: s_lshr_b32 s63, s92, 16
+; GFX11-NEXT: s_lshr_b32 s86, s77, 24
; GFX11-NEXT: s_lshr_b32 s87, s77, 8
; GFX11-NEXT: s_lshr_b32 s52, s76, 16
-; GFX11-NEXT: s_lshr_b32 s100, s76, 8
; GFX11-NEXT: v_writelane_b32 v20, s63, 23
; GFX11-NEXT: s_lshr_b32 s63, s92, 8
+; GFX11-NEXT: s_lshr_b32 s100, s76, 8
; GFX11-NEXT: s_lshr_b64 s[76:77], s[76:77], 24
; GFX11-NEXT: s_lshr_b32 s101, s89, 8
-; GFX11-NEXT: s_lshr_b32 s98, s79, 24
; GFX11-NEXT: v_writelane_b32 v20, s63, 24
; GFX11-NEXT: s_lshr_b32 s63, s95, 24
+; GFX11-NEXT: s_lshr_b32 s98, s79, 24
; GFX11-NEXT: s_lshr_b32 s99, s79, 8
; GFX11-NEXT: s_lshr_b32 s53, s78, 16
-; GFX11-NEXT: s_lshr_b32 s97, s78, 8
; GFX11-NEXT: v_writelane_b32 v20, s63, 25
; GFX11-NEXT: s_lshr_b32 s63, s95, 8
+; GFX11-NEXT: s_lshr_b32 s97, s78, 8
; GFX11-NEXT: s_lshr_b64 s[78:79], s[78:79], 24
; GFX11-NEXT: s_lshr_b64 s[92:93], s[92:93], 24
-; GFX11-NEXT: s_lshr_b32 s102, s94, 16
; GFX11-NEXT: v_writelane_b32 v20, s63, 26
; GFX11-NEXT: s_lshr_b32 s63, s43, 24
+; GFX11-NEXT: s_lshr_b32 s102, s94, 16
; GFX11-NEXT: s_lshr_b32 s103, s94, 8
; GFX11-NEXT: s_lshr_b64 s[94:95], s[94:95], 24
-; GFX11-NEXT: s_lshr_b32 s73, s91, 24
; GFX11-NEXT: v_writelane_b32 v20, s63, 27
; GFX11-NEXT: s_lshr_b32 s63, s43, 8
+; GFX11-NEXT: s_lshr_b32 s73, s91, 24
; GFX11-NEXT: s_lshr_b32 s77, s91, 8
; GFX11-NEXT: s_lshr_b32 s83, s90, 8
-; GFX11-NEXT: s_lshr_b32 s66, s37, 24
; GFX11-NEXT: v_writelane_b32 v20, s63, 28
; GFX11-NEXT: s_lshr_b32 s63, s42, 16
; GFX11-NEXT: s_lshr_b64 s[42:43], s[42:43], 24
+; GFX11-NEXT: s_lshr_b32 s66, s37, 24
; GFX11-NEXT: s_lshr_b32 s67, s37, 8
-; GFX11-NEXT: s_lshr_b32 s68, s36, 16
; GFX11-NEXT: v_writelane_b32 v20, s42, 6
+; GFX11-NEXT: s_lshr_b32 s68, s36, 16
; GFX11-NEXT: s_lshr_b32 s49, s36, 8
; GFX11-NEXT: s_lshr_b32 s69, s35, 24
; GFX11-NEXT: s_lshr_b32 s70, s35, 8
-; GFX11-NEXT: s_lshr_b32 s64, s34, 16
; GFX11-NEXT: v_writelane_b32 v20, s43, 7
; GFX11-NEXT: s_lshr_b64 s[42:43], s[44:45], 24
+; GFX11-NEXT: s_lshr_b32 s64, s34, 16
; GFX11-NEXT: s_lshr_b32 s80, s34, 8
; GFX11-NEXT: s_lshr_b32 s79, s45, 24
-; GFX11-NEXT: s_lshr_b32 s93, s45, 8
; GFX11-NEXT: v_writelane_b32 v20, s42, 4
+; GFX11-NEXT: s_lshr_b32 s42, s89, 24
+; GFX11-NEXT: s_lshr_b32 s93, s45, 8
; GFX11-NEXT: s_lshr_b32 s95, s44, 16
; GFX11-NEXT: s_lshr_b32 vcc_hi, s44, 8
-; GFX11-NEXT: s_lshr_b32 s34, s47, 24
-; GFX11-NEXT: s_lshr_b32 s55, s47, 8
; GFX11-NEXT: v_writelane_b32 v20, s43, 5
; GFX11-NEXT: s_lshr_b32 s43, s88, 16
-; GFX11-NEXT: s_lshr_b32 s42, s89, 24
+; GFX11-NEXT: s_lshr_b32 s34, s47, 24
+; GFX11-NEXT: s_lshr_b32 s55, s47, 8
; GFX11-NEXT: s_lshr_b32 s35, s46, 16
-; GFX11-NEXT: s_lshr_b32 s36, s46, 8
; GFX11-NEXT: v_writelane_b32 v20, s43, 29
; GFX11-NEXT: s_lshr_b32 s43, s88, 8
; GFX11-NEXT: s_lshr_b64 s[88:89], s[88:89], 24
; GFX11-NEXT: s_lshr_b32 s89, s90, 16
; GFX11-NEXT: s_lshr_b64 s[90:91], s[90:91], 24
; GFX11-NEXT: v_writelane_b32 v20, s43, 30
+; GFX11-NEXT: s_lshr_b32 s36, s46, 8
; GFX11-NEXT: s_lshr_b64 s[46:47], s[46:47], 24
; GFX11-NEXT: s_lshr_b32 s37, s57, 24
; GFX11-NEXT: s_lshr_b32 s38, s57, 8
@@ -174258,9 +174258,9 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX11-NEXT: s_and_b32 s16, s16, 0xff
; GFX11-NEXT: s_lshl_b32 s19, s73, 8
; GFX11-NEXT: s_or_b32 s16, s16, s17
-; GFX11-NEXT: v_readlane_b32 s96, v19, 0
+; GFX11-NEXT: v_readlane_b32 s17, v20, 1
; GFX11-NEXT: s_lshl_b32 s16, s16, 16
-; GFX11-NEXT: v_readlane_b32 s81, v18, 25
+; GFX11-NEXT: s_lshl_b32 s17, s70, 8
; GFX11-NEXT: s_or_b32 s3, s3, s16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3
@@ -174272,14 +174272,14 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX11-NEXT: v_readlane_b32 s16, v20, 0
; GFX11-NEXT: s_or_b32 s0, s0, s1
; GFX11-NEXT: s_and_b32 s1, s21, 0xff
-; GFX11-NEXT: v_readlane_b32 s17, v20, 1
-; GFX11-NEXT: s_or_b32 s1, s1, s2
-; GFX11-NEXT: v_readlane_b32 s2, v20, 18
-; GFX11-NEXT: s_and_b32 s1, s1, 0xffff
-; GFX11-NEXT: s_lshl_b32 s17, s70, 8
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
+; GFX11-NEXT: s_or_b32 s1, s1, s2
+; GFX11-NEXT: v_readlane_b32 s2, v20, 18
+; GFX11-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX11-NEXT: v_readlane_b32 s96, v19, 0
+; GFX11-NEXT: v_readlane_b32 s81, v18, 25
; GFX11-NEXT: v_readlane_b32 s70, v18, 22
; GFX11-NEXT: s_and_b32 s2, s2, 0xff
; GFX11-NEXT: v_readlane_b32 s69, v18, 21
@@ -174308,22 +174308,21 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX11-NEXT: s_or_b32 s17, s17, s18
; GFX11-NEXT: s_and_b32 s0, s0, 0xffff
; GFX11-NEXT: s_lshl_b32 s16, s17, 16
-; GFX11-NEXT: s_and_b32 s18, s71, 0xff
+; GFX11-NEXT: v_readlane_b32 s17, v20, 3
; GFX11-NEXT: s_or_b32 s3, s3, s16
; GFX11-NEXT: v_readlane_b32 s16, v20, 2
; GFX11-NEXT: v_dual_mov_b32 v11, s2 :: v_dual_mov_b32 v12, s3
; GFX11-NEXT: s_and_b32 s2, s68, 0xff
-; GFX11-NEXT: v_readlane_b32 s17, v20, 3
+; GFX11-NEXT: s_lshl_b32 s17, s66, 8
; GFX11-NEXT: s_lshl_b32 s3, s16, 8
; GFX11-NEXT: v_readlane_b32 s16, v20, 20
; GFX11-NEXT: s_or_b32 s1, s2, s3
; GFX11-NEXT: s_and_b32 s2, s25, 0xff
; GFX11-NEXT: s_lshl_b32 s3, s67, 8
-; GFX11-NEXT: s_lshl_b32 s17, s66, 8
+; GFX11-NEXT: s_lshl_b32 s1, s1, 16
; GFX11-NEXT: s_and_b32 s16, s16, 0xff
; GFX11-NEXT: s_or_b32 s2, s2, s3
; GFX11-NEXT: s_or_b32 s3, s16, s17
-; GFX11-NEXT: s_lshl_b32 s1, s1, 16
; GFX11-NEXT: s_and_b32 s2, s2, 0xffff
; GFX11-NEXT: s_lshl_b32 s3, s3, 16
; GFX11-NEXT: s_or_b32 s0, s0, s1
@@ -174336,9 +174335,10 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
; GFX11-NEXT: s_or_b32 s3, s16, s17
; GFX11-NEXT: s_and_b32 s16, s27, 0xff
; GFX11-NEXT: s_lshl_b32 s17, s77, 8
-; GFX11-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX11-NEXT: s_and_b32 s18, s71, 0xff
; GFX11-NEXT: s_or_b32 s16, s16, s17
; GFX11-NEXT: s_or_b32 s17, s18, s19
+; GFX11-NEXT: s_and_b32 s2, s2, 0xffff
; GFX11-NEXT: s_lshl_b32 s3, s3, 16
; GFX11-NEXT: s_and_b32 s16, s16, 0xffff
; GFX11-NEXT: s_lshl_b32 s17, s17, 16
@@ -191731,8 +191731,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; SI-NEXT: v_writelane_b32 v62, s5, 3
; SI-NEXT: s_lshr_b64 s[4:5], s[44:45], 8
; SI-NEXT: v_writelane_b32 v62, s4, 0
-; SI-NEXT: v_writelane_b32 v62, s5, 1
; SI-NEXT: v_readfirstlane_b32 s4, v38
+; SI-NEXT: v_writelane_b32 v62, s5, 1
; SI-NEXT: s_lshl_b32 s4, s4, 16
; SI-NEXT: v_readfirstlane_b32 s5, v37
; SI-NEXT: s_or_b32 s42, s5, s4
@@ -191766,8 +191766,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; SI-NEXT: v_writelane_b32 v62, s5, 15
; SI-NEXT: s_lshr_b64 s[4:5], s[40:41], 8
; SI-NEXT: v_writelane_b32 v62, s4, 12
-; SI-NEXT: v_writelane_b32 v62, s5, 13
; SI-NEXT: v_readfirstlane_b32 s4, v35
+; SI-NEXT: v_writelane_b32 v62, s5, 13
; SI-NEXT: s_lshl_b32 s4, s4, 16
; SI-NEXT: v_readfirstlane_b32 s5, v20
; SI-NEXT: s_or_b32 s28, s5, s4
@@ -191847,8 +191847,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; SI-NEXT: v_writelane_b32 v62, s5, 33
; SI-NEXT: s_lshr_b64 s[4:5], s[24:25], 8
; SI-NEXT: v_writelane_b32 v62, s4, 30
-; SI-NEXT: v_writelane_b32 v62, s5, 31
; SI-NEXT: v_readfirstlane_b32 s4, v7
+; SI-NEXT: v_writelane_b32 v62, s5, 31
; SI-NEXT: s_lshl_b32 s4, s4, 16
; SI-NEXT: v_readfirstlane_b32 s5, v29
; SI-NEXT: s_or_b32 s22, s5, s4
@@ -191864,8 +191864,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; SI-NEXT: v_writelane_b32 v62, s5, 39
; SI-NEXT: s_lshr_b64 s[4:5], s[22:23], 8
; SI-NEXT: v_writelane_b32 v62, s4, 36
-; SI-NEXT: v_writelane_b32 v62, s5, 37
; SI-NEXT: v_readfirstlane_b32 s4, v58
+; SI-NEXT: v_writelane_b32 v62, s5, 37
; SI-NEXT: s_lshl_b32 s4, s4, 16
; SI-NEXT: v_readfirstlane_b32 s5, v13
; SI-NEXT: s_or_b32 s20, s5, s4
@@ -192030,97 +192030,104 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: ; implicit-def: $sgpr80
-; SI-NEXT: v_mov_b32_e32 v51, v42
; SI-NEXT: v_writelane_b32 v62, s4, 0
; SI-NEXT: v_writelane_b32 v62, s5, 1
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: v_mov_b32_e32 v41, v21
; SI-NEXT: v_writelane_b32 v62, s4, 2
; SI-NEXT: v_writelane_b32 v62, s5, 3
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: v_mov_b32_e32 v21, v24
; SI-NEXT: v_writelane_b32 v62, s4, 4
; SI-NEXT: v_writelane_b32 v62, s5, 5
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: v_mov_b32_e32 v40, v34
; SI-NEXT: v_writelane_b32 v62, s4, 6
; SI-NEXT: v_writelane_b32 v62, s5, 7
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: v_mov_b32_e32 v34, v61
; SI-NEXT: v_writelane_b32 v62, s4, 8
; SI-NEXT: v_writelane_b32 v62, s5, 9
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: v_mov_b32_e32 v13, v12
; SI-NEXT: v_writelane_b32 v62, s4, 10
; SI-NEXT: v_writelane_b32 v62, s5, 11
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: v_mov_b32_e32 v12, v48
; SI-NEXT: v_writelane_b32 v62, s4, 12
; SI-NEXT: v_writelane_b32 v62, s5, 13
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: v_mov_b32_e32 v57, v30
; SI-NEXT: v_writelane_b32 v62, s4, 14
; SI-NEXT: v_writelane_b32 v62, s5, 15
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: v_mov_b32_e32 v58, v11
; SI-NEXT: v_writelane_b32 v62, s4, 16
; SI-NEXT: v_writelane_b32 v62, s5, 17
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: v_mov_b32_e32 v56, v47
; SI-NEXT: v_writelane_b32 v62, s4, 18
; SI-NEXT: v_writelane_b32 v62, s5, 19
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: v_mov_b32_e32 v46, v33
; SI-NEXT: v_writelane_b32 v62, s4, 20
; SI-NEXT: v_writelane_b32 v62, s5, 21
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: v_mov_b32_e32 v30, v32
; SI-NEXT: v_writelane_b32 v62, s4, 22
; SI-NEXT: v_writelane_b32 v62, s5, 23
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: v_mov_b32_e32 v32, v31
; SI-NEXT: v_writelane_b32 v62, s4, 24
; SI-NEXT: v_writelane_b32 v62, s5, 25
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: v_mov_b32_e32 v31, v10
; SI-NEXT: v_writelane_b32 v62, s4, 26
; SI-NEXT: v_writelane_b32 v62, s5, 27
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: s_waitcnt expcnt(2)
-; SI-NEXT: v_mov_b32_e32 v54, v9
; SI-NEXT: v_writelane_b32 v62, s4, 28
; SI-NEXT: v_writelane_b32 v62, s5, 29
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v55, v4
; SI-NEXT: v_writelane_b32 v62, s4, 30
; SI-NEXT: v_writelane_b32 v62, s5, 31
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: v_mov_b32_e32 v22, v2
; SI-NEXT: v_writelane_b32 v62, s4, 32
; SI-NEXT: v_writelane_b32 v62, s5, 33
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: v_mov_b32_e32 v2, v1
; SI-NEXT: v_writelane_b32 v62, s4, 34
; SI-NEXT: v_writelane_b32 v62, s5, 35
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: v_mov_b32_e32 v17, v43
; SI-NEXT: v_writelane_b32 v62, s4, 36
; SI-NEXT: v_writelane_b32 v62, s5, 37
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: s_mov_b64 vcc, -1
; SI-NEXT: v_writelane_b32 v62, s4, 38
; SI-NEXT: v_writelane_b32 v62, s5, 39
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: v_mov_b32_e32 v25, v59
; SI-NEXT: v_writelane_b32 v62, s4, 40
; SI-NEXT: v_writelane_b32 v62, s5, 41
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: v_mov_b32_e32 v1, v52
; SI-NEXT: v_writelane_b32 v62, s4, 42
; SI-NEXT: v_writelane_b32 v62, s5, 43
; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_writelane_b32 v62, s4, 44
+; SI-NEXT: v_writelane_b32 v62, s5, 45
+; SI-NEXT: ; implicit-def: $sgpr80
+; SI-NEXT: v_writelane_b32 v62, s80, 46
+; SI-NEXT: v_writelane_b32 v62, s81, 47
+; SI-NEXT: ; implicit-def: $sgpr80
+; SI-NEXT: v_writelane_b32 v62, s80, 48
+; SI-NEXT: v_mov_b32_e32 v51, v42
+; SI-NEXT: v_mov_b32_e32 v41, v21
+; SI-NEXT: v_mov_b32_e32 v21, v24
+; SI-NEXT: v_mov_b32_e32 v40, v34
+; SI-NEXT: v_mov_b32_e32 v34, v61
+; SI-NEXT: v_mov_b32_e32 v13, v12
+; SI-NEXT: v_mov_b32_e32 v12, v48
+; SI-NEXT: v_mov_b32_e32 v57, v30
+; SI-NEXT: v_mov_b32_e32 v58, v11
+; SI-NEXT: v_mov_b32_e32 v56, v47
+; SI-NEXT: v_mov_b32_e32 v46, v33
+; SI-NEXT: v_mov_b32_e32 v30, v32
+; SI-NEXT: v_mov_b32_e32 v32, v31
+; SI-NEXT: v_mov_b32_e32 v31, v10
+; SI-NEXT: s_waitcnt expcnt(2)
+; SI-NEXT: v_mov_b32_e32 v54, v9
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v55, v4
+; SI-NEXT: v_mov_b32_e32 v22, v2
+; SI-NEXT: v_mov_b32_e32 v2, v1
+; SI-NEXT: v_mov_b32_e32 v17, v43
+; SI-NEXT: s_mov_b64 vcc, -1
+; SI-NEXT: v_writelane_b32 v62, s81, 49
+; SI-NEXT: v_mov_b32_e32 v25, v59
+; SI-NEXT: v_mov_b32_e32 v1, v52
; SI-NEXT: ; implicit-def: $sgpr44
; SI-NEXT: ; implicit-def: $sgpr71
; SI-NEXT: ; implicit-def: $vgpr24
@@ -192190,14 +192197,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; SI-NEXT: ; implicit-def: $sgpr86
; SI-NEXT: ; implicit-def: $sgpr84
; SI-NEXT: ; implicit-def: $sgpr82
-; SI-NEXT: v_writelane_b32 v62, s4, 44
-; SI-NEXT: v_writelane_b32 v62, s5, 45
-; SI-NEXT: v_writelane_b32 v62, s80, 46
-; SI-NEXT: v_writelane_b32 v62, s81, 47
-; SI-NEXT: ; implicit-def: $sgpr80
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: v_writelane_b32 v62, s80, 48
-; SI-NEXT: v_writelane_b32 v62, s81, 49
; SI-NEXT: ; implicit-def: $sgpr80
; SI-NEXT: .LBB95_3: ; %Flow
; SI-NEXT: v_mov_b32_e32 v14, v17
@@ -192731,11 +192731,9 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; SI-NEXT: s_lshr_b32 s71, s45, 8
; SI-NEXT: .LBB95_5: ; %end
; SI-NEXT: v_readlane_b32 vcc_lo, v62, 0
-; SI-NEXT: v_readlane_b32 vcc_hi, v62, 1
; SI-NEXT: s_lshl_b32 s47, vcc_lo, 8
-; SI-NEXT: v_readlane_b32 vcc_lo, v62, 2
; SI-NEXT: s_and_b32 s44, s44, 0xff
-; SI-NEXT: v_readlane_b32 vcc_hi, v62, 3
+; SI-NEXT: v_readlane_b32 vcc_lo, v62, 2
; SI-NEXT: s_or_b32 s44, s44, s47
; SI-NEXT: s_and_b32 s47, vcc_lo, 0xff
; SI-NEXT: v_readlane_b32 vcc_lo, v62, 4
@@ -192756,10 +192754,9 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; SI-NEXT: v_or_b32_e32 v13, v21, v13
; SI-NEXT: v_or_b32_e32 v13, s44, v13
; SI-NEXT: v_readlane_b32 s44, v62, 6
-; SI-NEXT: v_readlane_b32 s45, v62, 7
; SI-NEXT: s_lshl_b32 s44, s44, 8
; SI-NEXT: s_and_b32 s42, s42, 0xff
-; SI-NEXT: v_readlane_b32 vcc_hi, v62, 5
+; SI-NEXT: v_readlane_b32 s45, v62, 7
; SI-NEXT: s_or_b32 s42, s42, s44
; SI-NEXT: v_readlane_b32 s44, v62, 8
; SI-NEXT: v_readlane_b32 s45, v62, 9
@@ -192781,9 +192778,9 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; SI-NEXT: v_or_b32_e32 v12, v23, v12
; SI-NEXT: v_or_b32_e32 v12, s42, v12
; SI-NEXT: v_readlane_b32 s42, v62, 12
-; SI-NEXT: v_readlane_b32 s43, v62, 13
; SI-NEXT: s_lshl_b32 s42, s42, 8
; SI-NEXT: s_and_b32 s40, s40, 0xff
+; SI-NEXT: v_readlane_b32 s43, v62, 13
; SI-NEXT: s_or_b32 s40, s40, s42
; SI-NEXT: v_readlane_b32 s42, v62, 14
; SI-NEXT: v_readlane_b32 s43, v62, 15
@@ -192805,9 +192802,9 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; SI-NEXT: v_or_b32_e32 v11, v24, v11
; SI-NEXT: v_or_b32_e32 v11, s40, v11
; SI-NEXT: v_readlane_b32 s40, v62, 18
-; SI-NEXT: v_readlane_b32 s41, v62, 19
; SI-NEXT: s_lshl_b32 s40, s40, 8
; SI-NEXT: s_and_b32 s28, s28, 0xff
+; SI-NEXT: v_readlane_b32 s41, v62, 19
; SI-NEXT: s_or_b32 s28, s28, s40
; SI-NEXT: v_readlane_b32 s40, v62, 20
; SI-NEXT: v_readlane_b32 s41, v62, 21
@@ -192829,9 +192826,9 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; SI-NEXT: v_or_b32_e32 v25, v26, v25
; SI-NEXT: v_or_b32_e32 v25, s28, v25
; SI-NEXT: v_readlane_b32 s28, v62, 24
-; SI-NEXT: v_readlane_b32 s29, v62, 25
; SI-NEXT: s_lshl_b32 s28, s28, 8
; SI-NEXT: s_and_b32 s26, s26, 0xff
+; SI-NEXT: v_readlane_b32 s29, v62, 25
; SI-NEXT: s_or_b32 s26, s26, s28
; SI-NEXT: v_readlane_b32 s28, v62, 26
; SI-NEXT: v_readlane_b32 s29, v62, 27
@@ -192853,14 +192850,17 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; SI-NEXT: v_or_b32_e32 v27, v28, v27
; SI-NEXT: v_or_b32_e32 v27, s26, v27
; SI-NEXT: v_readlane_b32 s26, v62, 30
-; SI-NEXT: v_readlane_b32 s27, v62, 31
; SI-NEXT: s_lshl_b32 s26, s26, 8
; SI-NEXT: s_and_b32 s24, s24, 0xff
+; SI-NEXT: v_readlane_b32 vcc_hi, v62, 1
+; SI-NEXT: v_readlane_b32 s27, v62, 31
; SI-NEXT: s_or_b32 s24, s24, s26
; SI-NEXT: v_readlane_b32 s26, v62, 32
+; SI-NEXT: v_readlane_b32 vcc_hi, v62, 3
; SI-NEXT: v_readlane_b32 s27, v62, 33
; SI-NEXT: s_and_b32 s26, s26, 0xff
; SI-NEXT: v_readlane_b32 s28, v62, 34
+; SI-NEXT: v_readlane_b32 vcc_hi, v62, 5
; SI-NEXT: s_lshl_b32 s27, s28, 24
; SI-NEXT: s_lshl_b32 s26, s26, 16
; SI-NEXT: v_readlane_b32 vcc_hi, v62, 11
@@ -192894,11 +192894,11 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_add_i32_e32 v11, vcc, 24, v0
-; SI-NEXT: v_readlane_b32 s25, v62, 37
; SI-NEXT: s_lshl_b32 s24, s24, 8
; SI-NEXT: s_and_b32 s22, s22, 0xff
; SI-NEXT: buffer_store_dword v24, v11, s[0:3], 0 offen
; SI-NEXT: v_add_i32_e32 v11, vcc, 28, v0
+; SI-NEXT: v_readlane_b32 s25, v62, 37
; SI-NEXT: s_or_b32 s22, s22, s24
; SI-NEXT: v_readlane_b32 s24, v62, 38
; SI-NEXT: buffer_store_dword v25, v11, s[0:3], 0 offen
@@ -196410,7 +196410,6 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; GFX11-NEXT: ; implicit-def: $sgpr74
; GFX11-NEXT: ; implicit-def: $sgpr42
; GFX11-NEXT: v_writelane_b32 v78, s42, 0
-; GFX11-NEXT: v_writelane_b32 v78, s43, 1
; GFX11-NEXT: ; implicit-def: $sgpr42
; GFX11-NEXT: ; kill: killed $sgpr42
; GFX11-NEXT: ; implicit-def: $sgpr42
@@ -196421,8 +196420,10 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; GFX11-NEXT: ; kill: killed $sgpr42
; GFX11-NEXT: ; implicit-def: $sgpr42
; GFX11-NEXT: ; kill: killed $sgpr42
+; GFX11-NEXT: v_writelane_b32 v78, s43, 1
; GFX11-NEXT: ; implicit-def: $sgpr42
; GFX11-NEXT: ; kill: killed $sgpr42
+; GFX11-NEXT: ; implicit-def: $sgpr43
; GFX11-NEXT: ; implicit-def: $sgpr42
; GFX11-NEXT: ; kill: killed $sgpr42
; GFX11-NEXT: ; implicit-def: $sgpr42
@@ -196485,7 +196486,6 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; GFX11-NEXT: ; kill: killed $sgpr42
; GFX11-NEXT: ; implicit-def: $sgpr42
; GFX11-NEXT: ; kill: killed $sgpr42
-; GFX11-NEXT: ; implicit-def: $sgpr43
; GFX11-NEXT: ; implicit-def: $sgpr42
; GFX11-NEXT: s_branch .LBB95_2
; GFX11-NEXT: .LBB95_4:
@@ -196568,9 +196568,10 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; GFX11-NEXT: v_readlane_b32 s0, v78, 18
; GFX11-NEXT: v_dual_mov_b32 v67, s58 :: v_dual_mov_b32 v26, s88
; GFX11-NEXT: v_dual_mov_b32 v81, s44 :: v_dual_mov_b32 v30, s90
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_mov_b32_e32 v42, s0
; GFX11-NEXT: v_readlane_b32 s0, v78, 19
+; GFX11-NEXT: v_readlane_b32 s1, v78, 1
; GFX11-NEXT: v_dual_mov_b32 v17, s74 :: v_dual_mov_b32 v34, s92
; GFX11-NEXT: v_dual_mov_b32 v65, s94 :: v_dual_mov_b32 v68, s30
; GFX11-NEXT: v_mov_b32_e32 v41, s0
@@ -196630,10 +196631,9 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
; GFX11-NEXT: v_readlane_b32 s0, v77, 7
; GFX11-NEXT: v_mov_b32_e32 v149, s0
; GFX11-NEXT: v_readlane_b32 s0, v77, 8
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v148, s0
; GFX11-NEXT: v_readlane_b32 s0, v78, 0
-; GFX11-NEXT: v_readlane_b32 s1, v78, 1
; GFX11-NEXT: v_mov_b32_e32 v82, s0
; GFX11-NEXT: .LBB95_5: ; %end
; GFX11-NEXT: v_lshlrev_b32_e32 v69, 8, v74
@@ -214805,7 +214805,6 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: v_writelane_b32 v41, s17, 23
; SI-NEXT: s_lshr_b64 s[16:17], s[60:61], 16
; SI-NEXT: v_writelane_b32 v41, s16, 20
-; SI-NEXT: v_writelane_b32 v41, s17, 21
; SI-NEXT: s_lshr_b32 s16, s61, 24
; SI-NEXT: v_writelane_b32 v43, s16, 18
; SI-NEXT: s_lshr_b32 s16, s61, 16
@@ -214907,6 +214906,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: s_lshr_b32 s16, s5, 16
; SI-NEXT: v_writelane_b32 v42, s16, 0
; SI-NEXT: s_lshr_b32 s16, s5, 8
+; SI-NEXT: v_writelane_b32 v41, s17, 21
; SI-NEXT: v_writelane_b32 v42, s16, 1
; SI-NEXT: s_lshr_b64 s[16:17], s[56:57], 24
; SI-NEXT: v_writelane_b32 v41, s16, 28
@@ -215049,7 +215049,6 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: v_readlane_b32 s17, v41, 25
; SI-NEXT: s_lshl_b32 s17, s16, 8
; SI-NEXT: s_and_b32 s18, s56, 0xff
-; SI-NEXT: v_readlane_b32 s21, v41, 23
; SI-NEXT: s_or_b32 s17, s18, s17
; SI-NEXT: v_readlane_b32 s18, v41, 26
; SI-NEXT: v_readlane_b32 s19, v41, 27
@@ -215191,7 +215190,6 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: v_mov_b32_e32 v14, s17
; SI-NEXT: s_lshl_b32 s17, s90, 8
; SI-NEXT: s_and_b32 s18, s46, 0xff
-; SI-NEXT: v_readlane_b32 s21, v41, 29
; SI-NEXT: s_or_b32 s17, s18, s17
; SI-NEXT: v_readlane_b32 s18, v41, 30
; SI-NEXT: v_readlane_b32 s19, v41, 31
@@ -215210,82 +215208,78 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: s_or_b32 s17, s17, s18
; SI-NEXT: s_and_b32 s18, s16, 0xff
; SI-NEXT: v_readlane_b32 s16, v43, 39
-; SI-NEXT: s_lshl_b32 s18, s18, 16
-; SI-NEXT: s_lshl_b32 s19, s16, 24
-; SI-NEXT: s_or_b32 s18, s19, s18
-; SI-NEXT: s_and_b32 s17, s17, 0xffff
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0
-; SI-NEXT: s_or_b32 s17, s17, s18
+; SI-NEXT: s_lshl_b32 s18, s18, 16
+; SI-NEXT: s_lshl_b32 s19, s16, 24
; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0
-; SI-NEXT: v_mov_b32_e32 v16, s17
-; SI-NEXT: v_readlane_b32 s16, v41, 34
+; SI-NEXT: s_or_b32 s18, s19, s18
+; SI-NEXT: s_and_b32 s17, s17, 0xffff
; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0
-; SI-NEXT: v_readlane_b32 s17, v41, 35
-; SI-NEXT: v_readlane_b32 s18, v41, 36
+; SI-NEXT: s_or_b32 s17, s17, s18
; SI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen
; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0
-; SI-NEXT: s_lshl_b32 s16, s16, 8
-; SI-NEXT: s_and_b32 s17, s42, 0xff
-; SI-NEXT: v_readlane_b32 s19, v41, 37
+; SI-NEXT: v_mov_b32_e32 v16, s17
+; SI-NEXT: v_readlane_b32 s16, v41, 34
+; SI-NEXT: v_readlane_b32 s17, v41, 35
; SI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen
; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0
+; SI-NEXT: s_lshl_b32 s16, s16, 8
+; SI-NEXT: s_and_b32 s17, s42, 0xff
+; SI-NEXT: v_readlane_b32 s18, v41, 36
+; SI-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen
+; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0
; SI-NEXT: s_or_b32 s16, s17, s16
; SI-NEXT: s_and_b32 s17, s18, 0xff
; SI-NEXT: v_readlane_b32 s18, v41, 38
-; SI-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen
-; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0
-; SI-NEXT: s_lshl_b32 s18, s18, 24
-; SI-NEXT: s_lshl_b32 s17, s17, 16
; SI-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen
; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0
-; SI-NEXT: s_or_b32 s17, s18, s17
-; SI-NEXT: s_and_b32 s16, s16, 0xffff
+; SI-NEXT: s_lshl_b32 s18, s18, 24
+; SI-NEXT: s_lshl_b32 s17, s17, 16
; SI-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen
; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0
-; SI-NEXT: s_or_b32 s16, s16, s17
+; SI-NEXT: s_or_b32 s17, s18, s17
+; SI-NEXT: s_and_b32 s16, s16, 0xffff
; SI-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen
; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0
-; SI-NEXT: v_readlane_b32 s17, v43, 44
+; SI-NEXT: s_or_b32 s16, s16, s17
; SI-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen
; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0
+; SI-NEXT: v_readlane_b32 s17, v43, 44
+; SI-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen
+; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0
; SI-NEXT: v_mov_b32_e32 v2, s16
; SI-NEXT: s_and_b32 s16, s43, 0xff
; SI-NEXT: s_lshl_b32 s17, s17, 8
-; SI-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen
-; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0
-; SI-NEXT: s_or_b32 s16, s16, s17
-; SI-NEXT: v_readlane_b32 s17, v43, 43
; SI-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen
; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0
-; SI-NEXT: s_and_b32 s17, s17, 0xff
-; SI-NEXT: v_readlane_b32 s18, v43, 42
-; SI-NEXT: v_readlane_b32 s19, v41, 39
+; SI-NEXT: s_or_b32 s16, s16, s17
+; SI-NEXT: v_readlane_b32 s17, v43, 43
; SI-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen
; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0
-; SI-NEXT: s_lshl_b32 s17, s17, 16
-; SI-NEXT: s_lshl_b32 s18, s18, 24
+; SI-NEXT: s_and_b32 s17, s17, 0xff
+; SI-NEXT: v_readlane_b32 s18, v43, 42
; SI-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen
; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0
-; SI-NEXT: s_or_b32 s17, s18, s17
-; SI-NEXT: v_readlane_b32 s18, v41, 40
+; SI-NEXT: s_lshl_b32 s17, s17, 16
+; SI-NEXT: s_lshl_b32 s18, s18, 24
; SI-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen
; SI-NEXT: v_add_i32_e32 v1, vcc, 60, v0
; SI-NEXT: s_and_b32 s16, s16, 0xffff
-; SI-NEXT: v_readlane_b32 s19, v41, 41
+; SI-NEXT: s_or_b32 s17, s18, s17
; SI-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen
; SI-NEXT: v_add_i32_e32 v1, vcc, 64, v0
; SI-NEXT: s_or_b32 s16, s16, s17
-; SI-NEXT: s_lshl_b32 s17, s18, 8
-; SI-NEXT: v_readlane_b32 s18, v41, 42
+; SI-NEXT: v_readlane_b32 s18, v41, 40
; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, s16
; SI-NEXT: s_and_b32 s16, s40, 0xff
-; SI-NEXT: v_readlane_b32 s19, v41, 43
+; SI-NEXT: s_lshl_b32 s17, s18, 8
+; SI-NEXT: v_readlane_b32 s18, v41, 42
; SI-NEXT: s_or_b32 s16, s16, s17
; SI-NEXT: s_and_b32 s17, s18, 0xff
; SI-NEXT: v_readlane_b32 s18, v41, 44
@@ -215316,9 +215310,8 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: v_mov_b32_e32 v2, s16
; SI-NEXT: v_readlane_b32 s16, v41, 46
; SI-NEXT: s_and_b32 s14, s14, 0xff
-; SI-NEXT: v_readlane_b32 s17, v41, 47
; SI-NEXT: s_lshl_b32 s16, s16, 8
-; SI-NEXT: v_readlane_b32 s19, v41, 45
+; SI-NEXT: v_readlane_b32 s17, v41, 47
; SI-NEXT: s_or_b32 s14, s14, s16
; SI-NEXT: v_readlane_b32 s16, v41, 48
; SI-NEXT: v_readlane_b32 s17, v41, 49
@@ -215351,8 +215344,8 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: v_mov_b32_e32 v2, s14
; SI-NEXT: v_readlane_b32 s14, v41, 52
; SI-NEXT: s_and_b32 s12, s12, 0xff
-; SI-NEXT: v_readlane_b32 s15, v41, 53
; SI-NEXT: s_lshl_b32 s14, s14, 8
+; SI-NEXT: v_readlane_b32 s15, v41, 53
; SI-NEXT: s_or_b32 s12, s12, s14
; SI-NEXT: v_readlane_b32 s14, v41, 54
; SI-NEXT: v_readlane_b32 s15, v41, 55
@@ -215385,8 +215378,8 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: v_mov_b32_e32 v2, s12
; SI-NEXT: v_readlane_b32 s12, v41, 58
; SI-NEXT: s_and_b32 s10, s10, 0xff
-; SI-NEXT: v_readlane_b32 s13, v41, 59
; SI-NEXT: s_lshl_b32 s12, s12, 8
+; SI-NEXT: v_readlane_b32 s13, v41, 59
; SI-NEXT: s_or_b32 s10, s10, s12
; SI-NEXT: v_readlane_b32 s12, v41, 60
; SI-NEXT: v_readlane_b32 s13, v41, 61
@@ -215419,8 +215412,8 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: v_mov_b32_e32 v2, s10
; SI-NEXT: v_readlane_b32 s10, v43, 0
; SI-NEXT: s_and_b32 s8, s8, 0xff
-; SI-NEXT: v_readlane_b32 s11, v43, 1
; SI-NEXT: s_lshl_b32 s10, s10, 8
+; SI-NEXT: v_readlane_b32 s11, v43, 1
; SI-NEXT: s_or_b32 s8, s8, s10
; SI-NEXT: v_readlane_b32 s10, v43, 2
; SI-NEXT: v_readlane_b32 s11, v43, 3
@@ -215453,8 +215446,8 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: v_mov_b32_e32 v2, s8
; SI-NEXT: v_readlane_b32 s8, v43, 6
; SI-NEXT: s_and_b32 s6, s6, 0xff
-; SI-NEXT: v_readlane_b32 s9, v43, 7
; SI-NEXT: s_lshl_b32 s8, s8, 8
+; SI-NEXT: v_readlane_b32 s9, v43, 7
; SI-NEXT: s_or_b32 s6, s6, s8
; SI-NEXT: v_readlane_b32 s8, v43, 8
; SI-NEXT: v_readlane_b32 s9, v43, 9
@@ -215487,8 +215480,8 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: v_mov_b32_e32 v2, s6
; SI-NEXT: v_readlane_b32 s6, v43, 12
; SI-NEXT: s_and_b32 s4, s4, 0xff
-; SI-NEXT: v_readlane_b32 s7, v43, 13
; SI-NEXT: s_lshl_b32 s6, s6, 8
+; SI-NEXT: v_readlane_b32 s7, v43, 13
; SI-NEXT: s_or_b32 s4, s4, s6
; SI-NEXT: v_readlane_b32 s6, v43, 14
; SI-NEXT: v_readlane_b32 s7, v43, 15
@@ -215508,14 +215501,21 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: s_lshl_b32 s5, s5, 8
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: v_readlane_b32 s5, v42, 0
+; SI-NEXT: v_readlane_b32 s19, v41, 37
; SI-NEXT: s_and_b32 s5, s5, 0xff
; SI-NEXT: v_readlane_b32 s6, v43, 63
+; SI-NEXT: v_readlane_b32 s19, v41, 39
; SI-NEXT: s_lshl_b32 s5, s5, 16
; SI-NEXT: s_lshl_b32 s6, s6, 24
+; SI-NEXT: v_readlane_b32 s19, v41, 41
; SI-NEXT: s_and_b32 s4, s4, 0xffff
; SI-NEXT: s_or_b32 s5, s6, s5
+; SI-NEXT: v_readlane_b32 s21, v41, 23
+; SI-NEXT: v_readlane_b32 s19, v41, 43
; SI-NEXT: v_add_i32_e32 v1, vcc, 0x78, v0
; SI-NEXT: s_or_b32 s4, s4, s5
+; SI-NEXT: v_readlane_b32 s21, v41, 29
+; SI-NEXT: v_readlane_b32 s19, v41, 45
; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
; SI-NEXT: v_mov_b32_e32 v1, s4
@@ -215573,47 +215573,14 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB99_4:
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: ; implicit-def: $sgpr16
; SI-NEXT: v_writelane_b32 v41, s4, 20
; SI-NEXT: v_writelane_b32 v41, s5, 21
; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: ; implicit-def: $sgpr60
-; SI-NEXT: ; implicit-def: $sgpr92
-; SI-NEXT: ; implicit-def: $sgpr56
-; SI-NEXT: ; implicit-def: $sgpr44
-; SI-NEXT: ; implicit-def: $sgpr82
-; SI-NEXT: ; implicit-def: $sgpr80
-; SI-NEXT: ; implicit-def: $sgpr70
-; SI-NEXT: ; implicit-def: $sgpr74
-; SI-NEXT: ; implicit-def: $sgpr68
-; SI-NEXT: ; implicit-def: $sgpr66
-; SI-NEXT: ; implicit-def: $sgpr64
-; SI-NEXT: ; implicit-def: $sgpr72
-; SI-NEXT: ; implicit-def: $sgpr54
-; SI-NEXT: ; implicit-def: $sgpr52
-; SI-NEXT: ; implicit-def: $sgpr50
-; SI-NEXT: ; implicit-def: $sgpr62
-; SI-NEXT: ; implicit-def: $sgpr48
-; SI-NEXT: ; implicit-def: $sgpr38
-; SI-NEXT: ; implicit-def: $sgpr36
-; SI-NEXT: ; implicit-def: $sgpr58
-; SI-NEXT: ; implicit-def: $sgpr34
-; SI-NEXT: ; implicit-def: $sgpr30
-; SI-NEXT: ; implicit-def: $sgpr94
-; SI-NEXT: ; implicit-def: $sgpr46
-; SI-NEXT: ; implicit-def: $sgpr90
-; SI-NEXT: ; implicit-def: $sgpr42
-; SI-NEXT: ; implicit-def: $sgpr40
-; SI-NEXT: ; implicit-def: $sgpr14
-; SI-NEXT: ; implicit-def: $sgpr12
-; SI-NEXT: ; implicit-def: $sgpr10
-; SI-NEXT: ; implicit-def: $sgpr8
-; SI-NEXT: ; implicit-def: $sgpr6
; SI-NEXT: v_writelane_b32 v41, s4, 22
-; SI-NEXT: v_writelane_b32 v41, s5, 23
; SI-NEXT: ; implicit-def: $sgpr4
; SI-NEXT: ; kill: killed $sgpr4
; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_writelane_b32 v41, s5, 23
; SI-NEXT: ; kill: killed $sgpr4
; SI-NEXT: ; implicit-def: $sgpr4
; SI-NEXT: ; kill: killed $sgpr4
@@ -215625,7 +215592,6 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: v_writelane_b32 v41, s5, 27
; SI-NEXT: ; implicit-def: $sgpr4
; SI-NEXT: v_writelane_b32 v41, s4, 28
-; SI-NEXT: v_writelane_b32 v41, s5, 29
; SI-NEXT: ; implicit-def: $sgpr4
; SI-NEXT: ; kill: killed $sgpr4
; SI-NEXT: ; implicit-def: $sgpr4
@@ -215713,6 +215679,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: ; implicit-def: $sgpr4
; SI-NEXT: ; kill: killed $sgpr4
; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_writelane_b32 v41, s5, 29
; SI-NEXT: ; kill: killed $sgpr4
; SI-NEXT: ; implicit-def: $sgpr4
; SI-NEXT: ; kill: killed $sgpr4
@@ -215766,7 +215733,6 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: v_writelane_b32 v41, s5, 61
; SI-NEXT: ; implicit-def: $sgpr4
; SI-NEXT: v_writelane_b32 v41, s4, 62
-; SI-NEXT: v_writelane_b32 v41, s5, 63
; SI-NEXT: ; implicit-def: $sgpr4
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_writelane_b32 v43, s4, 0
@@ -215786,14 +215752,48 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; SI-NEXT: ; implicit-def: $sgpr4
; SI-NEXT: v_writelane_b32 v43, s4, 10
; SI-NEXT: v_writelane_b32 v43, s5, 11
+; SI-NEXT: ; implicit-def: $sgpr16
; SI-NEXT: v_writelane_b32 v43, s16, 12
; SI-NEXT: v_writelane_b32 v43, s17, 13
; SI-NEXT: ; implicit-def: $sgpr16
-; SI-NEXT: ; implicit-def: $sgpr4
; SI-NEXT: v_writelane_b32 v43, s16, 14
; SI-NEXT: v_writelane_b32 v43, s17, 15
; SI-NEXT: ; implicit-def: $sgpr16
; SI-NEXT: v_writelane_b32 v43, s16, 16
+; SI-NEXT: ; implicit-def: $sgpr60
+; SI-NEXT: ; implicit-def: $sgpr92
+; SI-NEXT: ; implicit-def: $sgpr56
+; SI-NEXT: ; implicit-def: $sgpr44
+; SI-NEXT: ; implicit-def: $sgpr82
+; SI-NEXT: ; implicit-def: $sgpr80
+; SI-NEXT: ; implicit-def: $sgpr70
+; SI-NEXT: ; implicit-def: $sgpr74
+; SI-NEXT: ; implicit-def: $sgpr68
+; SI-NEXT: ; implicit-def: $sgpr66
+; SI-NEXT: ; implicit-def: $sgpr64
+; SI-NEXT: ; implicit-def: $sgpr72
+; SI-NEXT: ; implicit-def: $sgpr54
+; SI-NEXT: ; implicit-def: $sgpr52
+; SI-NEXT: ; implicit-def: $sgpr50
+; SI-NEXT: ; implicit-def: $sgpr62
+; SI-NEXT: ; implicit-def: $sgpr48
+; SI-NEXT: ; implicit-def: $sgpr38
+; SI-NEXT: ; implicit-def: $sgpr36
+; SI-NEXT: ; implicit-def: $sgpr58
+; SI-NEXT: ; implicit-def: $sgpr34
+; SI-NEXT: ; implicit-def: $sgpr30
+; SI-NEXT: ; implicit-def: $sgpr94
+; SI-NEXT: ; implicit-def: $sgpr46
+; SI-NEXT: ; implicit-def: $sgpr90
+; SI-NEXT: ; implicit-def: $sgpr42
+; SI-NEXT: ; implicit-def: $sgpr40
+; SI-NEXT: ; implicit-def: $sgpr14
+; SI-NEXT: ; implicit-def: $sgpr12
+; SI-NEXT: ; implicit-def: $sgpr10
+; SI-NEXT: v_writelane_b32 v41, s5, 63
+; SI-NEXT: ; implicit-def: $sgpr8
+; SI-NEXT: ; implicit-def: $sgpr6
+; SI-NEXT: ; implicit-def: $sgpr4
; SI-NEXT: v_writelane_b32 v43, s17, 17
; SI-NEXT: s_branch .LBB99_2
;
@@ -218766,7 +218766,6 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; GFX11-NEXT: ; implicit-def: $sgpr74
; GFX11-NEXT: ; implicit-def: $sgpr42
; GFX11-NEXT: v_writelane_b32 v78, s42, 0
-; GFX11-NEXT: v_writelane_b32 v78, s43, 1
; GFX11-NEXT: ; implicit-def: $sgpr42
; GFX11-NEXT: ; kill: killed $sgpr42
; GFX11-NEXT: ; implicit-def: $sgpr42
@@ -218777,8 +218776,10 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; GFX11-NEXT: ; kill: killed $sgpr42
; GFX11-NEXT: ; implicit-def: $sgpr42
; GFX11-NEXT: ; kill: killed $sgpr42
+; GFX11-NEXT: v_writelane_b32 v78, s43, 1
; GFX11-NEXT: ; implicit-def: $sgpr42
; GFX11-NEXT: ; kill: killed $sgpr42
+; GFX11-NEXT: ; implicit-def: $sgpr43
; GFX11-NEXT: ; implicit-def: $sgpr42
; GFX11-NEXT: ; kill: killed $sgpr42
; GFX11-NEXT: ; implicit-def: $sgpr42
@@ -218841,7 +218842,6 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; GFX11-NEXT: ; kill: killed $sgpr42
; GFX11-NEXT: ; implicit-def: $sgpr42
; GFX11-NEXT: ; kill: killed $sgpr42
-; GFX11-NEXT: ; implicit-def: $sgpr43
; GFX11-NEXT: ; implicit-def: $sgpr42
; GFX11-NEXT: s_branch .LBB99_2
; GFX11-NEXT: .LBB99_4:
@@ -218924,9 +218924,10 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; GFX11-NEXT: v_readlane_b32 s0, v78, 18
; GFX11-NEXT: v_dual_mov_b32 v67, s58 :: v_dual_mov_b32 v26, s88
; GFX11-NEXT: v_dual_mov_b32 v81, s44 :: v_dual_mov_b32 v30, s90
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_mov_b32_e32 v42, s0
; GFX11-NEXT: v_readlane_b32 s0, v78, 19
+; GFX11-NEXT: v_readlane_b32 s1, v78, 1
; GFX11-NEXT: v_dual_mov_b32 v17, s74 :: v_dual_mov_b32 v34, s92
; GFX11-NEXT: v_dual_mov_b32 v65, s94 :: v_dual_mov_b32 v68, s30
; GFX11-NEXT: v_mov_b32_e32 v41, s0
@@ -218986,10 +218987,9 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
; GFX11-NEXT: v_readlane_b32 s0, v77, 7
; GFX11-NEXT: v_mov_b32_e32 v149, s0
; GFX11-NEXT: v_readlane_b32 s0, v77, 8
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v148, s0
; GFX11-NEXT: v_readlane_b32 s0, v78, 0
-; GFX11-NEXT: v_readlane_b32 s1, v78, 1
; GFX11-NEXT: v_mov_b32_e32 v82, s0
; GFX11-NEXT: .LBB99_5: ; %end
; GFX11-NEXT: v_lshlrev_b32_e32 v69, 8, v74
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
index a0c596ff9d5de..a7f89579b5ce0 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
@@ -66747,11 +66747,9 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32
; SI-NEXT: v_writelane_b32 v21, s17, 13
; SI-NEXT: .LBB97_3: ; %end
; SI-NEXT: v_readlane_b32 s18, v21, 0
-; SI-NEXT: v_readlane_b32 s19, v21, 1
+; SI-NEXT: s_and_b32 s16, s40, 0xff
; SI-NEXT: s_lshl_b32 s17, s18, 8
; SI-NEXT: v_readlane_b32 s18, v21, 2
-; SI-NEXT: s_and_b32 s16, s40, 0xff
-; SI-NEXT: v_readlane_b32 s19, v21, 3
; SI-NEXT: s_or_b32 s16, s16, s17
; SI-NEXT: s_and_b32 s17, s18, 0xff
; SI-NEXT: v_readlane_b32 s18, v21, 4
@@ -66773,9 +66771,8 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32
; SI-NEXT: v_mov_b32_e32 v2, s16
; SI-NEXT: v_readlane_b32 s16, v21, 6
; SI-NEXT: s_and_b32 s14, s14, 0xff
-; SI-NEXT: v_readlane_b32 s17, v21, 7
; SI-NEXT: s_lshl_b32 s16, s16, 8
-; SI-NEXT: v_readlane_b32 s19, v21, 5
+; SI-NEXT: v_readlane_b32 s17, v21, 7
; SI-NEXT: s_or_b32 s14, s14, s16
; SI-NEXT: v_readlane_b32 s16, v21, 8
; SI-NEXT: v_readlane_b32 s17, v21, 9
@@ -66807,8 +66804,8 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32
; SI-NEXT: v_mov_b32_e32 v2, s14
; SI-NEXT: v_readlane_b32 s14, v21, 12
; SI-NEXT: s_and_b32 s10, s10, 0xff
-; SI-NEXT: v_readlane_b32 s15, v21, 13
; SI-NEXT: s_lshl_b32 s14, s14, 8
+; SI-NEXT: v_readlane_b32 s15, v21, 13
; SI-NEXT: s_or_b32 s10, s10, s14
; SI-NEXT: v_readlane_b32 s14, v21, 14
; SI-NEXT: v_readlane_b32 s15, v21, 15
@@ -66959,10 +66956,13 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32
; SI-NEXT: s_and_b32 s5, s89, 0xff
; SI-NEXT: s_lshl_b32 s5, s5, 16
; SI-NEXT: s_lshl_b32 s6, s91, 24
+; SI-NEXT: v_readlane_b32 s19, v21, 1
; SI-NEXT: s_and_b32 s4, s4, 0xffff
; SI-NEXT: s_or_b32 s5, s6, s5
+; SI-NEXT: v_readlane_b32 s19, v21, 3
; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0
; SI-NEXT: s_or_b32 s4, s4, s5
+; SI-NEXT: v_readlane_b32 s19, v21, 5
; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0
; SI-NEXT: v_mov_b32_e32 v1, s4
@@ -67017,6 +67017,28 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32
; SI-NEXT: v_writelane_b32 v21, s4, 0
; SI-NEXT: v_writelane_b32 v21, s5, 1
; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_writelane_b32 v21, s4, 2
+; SI-NEXT: v_writelane_b32 v21, s5, 3
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_writelane_b32 v21, s4, 4
+; SI-NEXT: v_writelane_b32 v21, s5, 5
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_writelane_b32 v21, s4, 6
+; SI-NEXT: v_writelane_b32 v21, s5, 7
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_writelane_b32 v21, s4, 8
+; SI-NEXT: v_writelane_b32 v21, s5, 9
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_writelane_b32 v21, s4, 10
+; SI-NEXT: v_writelane_b32 v21, s5, 11
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_writelane_b32 v21, s4, 12
+; SI-NEXT: v_writelane_b32 v21, s5, 13
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_writelane_b32 v21, s4, 14
+; SI-NEXT: v_writelane_b32 v21, s5, 15
+; SI-NEXT: ; implicit-def: $sgpr4
+; SI-NEXT: v_writelane_b32 v21, s4, 16
; SI-NEXT: ; implicit-def: $sgpr40
; SI-NEXT: ; implicit-def: $sgpr60
; SI-NEXT: ; implicit-def: $sgpr74
@@ -67044,6 +67066,7 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32
; SI-NEXT: ; implicit-def: $sgpr79
; SI-NEXT: ; implicit-def: $sgpr89
; SI-NEXT: ; implicit-def: $sgpr91
+; SI-NEXT: v_writelane_b32 v21, s5, 17
; SI-NEXT: ; implicit-def: $sgpr42
; SI-NEXT: ; implicit-def: $sgpr66
; SI-NEXT: ; implicit-def: $sgpr64
@@ -67060,33 +67083,10 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32
; SI-NEXT: ; implicit-def: $sgpr30
; SI-NEXT: ; implicit-def: $sgpr94
; SI-NEXT: ; implicit-def: $sgpr92
+; SI-NEXT: ; implicit-def: $sgpr4
; SI-NEXT: ; implicit-def: $sgpr90
; SI-NEXT: ; implicit-def: $sgpr88
; SI-NEXT: ; implicit-def: $sgpr78
-; SI-NEXT: v_writelane_b32 v21, s4, 2
-; SI-NEXT: v_writelane_b32 v21, s5, 3
-; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: v_writelane_b32 v21, s4, 4
-; SI-NEXT: v_writelane_b32 v21, s5, 5
-; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: v_writelane_b32 v21, s4, 6
-; SI-NEXT: v_writelane_b32 v21, s5, 7
-; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: v_writelane_b32 v21, s4, 8
-; SI-NEXT: v_writelane_b32 v21, s5, 9
-; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: v_writelane_b32 v21, s4, 10
-; SI-NEXT: v_writelane_b32 v21, s5, 11
-; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: v_writelane_b32 v21, s4, 12
-; SI-NEXT: v_writelane_b32 v21, s5, 13
-; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: v_writelane_b32 v21, s4, 14
-; SI-NEXT: v_writelane_b32 v21, s5, 15
-; SI-NEXT: ; implicit-def: $sgpr4
-; SI-NEXT: v_writelane_b32 v21, s4, 16
-; SI-NEXT: v_writelane_b32 v21, s5, 17
-; SI-NEXT: ; implicit-def: $sgpr4
; SI-NEXT: s_branch .LBB97_2
;
; VI-LABEL: bitcast_v32i16_to_v64i8_scalar:
@@ -88410,8 +88410,8 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
; SI-NEXT: s_lshr_b64 s[4:5], s[74:75], 24
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_writelane_b32 v41, s4, 0
-; SI-NEXT: v_writelane_b32 v41, s5, 1
; SI-NEXT: v_readfirstlane_b32 s4, v6
+; SI-NEXT: v_writelane_b32 v41, s5, 1
; SI-NEXT: s_lshr_b32 s5, s4, 16
; SI-NEXT: v_readfirstlane_b32 s4, v7
; SI-NEXT: s_lshr_b64 s[60:61], s[4:5], 16
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll
index 37cbd2d926413..34abba10f6c61 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll
@@ -150,8 +150,8 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %in) {
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ;;#ASMEND
-; GCN-NEXT: v_readlane_b32 s0, v0, 0
; GCN-NEXT: v_mov_b32_e32 v1, vcc_lo
+; GCN-NEXT: v_readlane_b32 s0, v0, 0
; GCN-NEXT: v_readlane_b32 s1, v0, 1
; GCN-NEXT: v_mov_b32_e32 v2, 0
; GCN-NEXT: ;;#ASMSTART
diff --git a/llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir b/llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir
index 5f36d5403ebcf..744871d8c84ff 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir
@@ -65,12 +65,12 @@ body: |
; CHECK: S_NOP 0, implicit-def $exec
; CHECK-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; CHECK-NEXT: $vgpr0 = IMPLICIT_DEF
- ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0, implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1
- ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr1, 1, $vgpr0, implicit $sgpr0_sgpr1
- ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr0_sgpr1
+ ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0
+ ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr1, 1, $vgpr0
+ ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0
; CHECK-NEXT: $sgpr1 = V_READLANE_B32 $vgpr0, 1
; CHECK-NEXT: S_NOP 0, implicit-def dead renamable $sgpr2_sgpr3, implicit-def dead renamable $sgpr0_sgpr1, implicit killed renamable $sgpr0_sgpr1
- ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr0_sgpr1
+ ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0
; CHECK-NEXT: $sgpr1 = V_READLANE_B32 $vgpr0, 1
; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1
; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec
@@ -141,12 +141,12 @@ body: |
; CHECK-LABEL: name: reload_sgpr_spill_into_copy_to_exec
; CHECK: S_NOP 0, implicit-def renamable $sgpr0_sgpr1, implicit-def dead renamable $sgpr2_sgpr3, implicit-def $exec
; CHECK-NEXT: $vgpr0 = IMPLICIT_DEF
- ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0, implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1
- ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr1, 1, $vgpr0, implicit $sgpr0_sgpr1
- ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr0_sgpr1
+ ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0
+ ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr1, 1, $vgpr0
+ ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0
; CHECK-NEXT: $sgpr1 = V_READLANE_B32 $vgpr0, 1
; CHECK-NEXT: S_NOP 0, implicit killed renamable $sgpr0_sgpr1, implicit-def dead renamable $sgpr2_sgpr3, implicit-def dead renamable $sgpr0_sgpr1
- ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr0_sgpr1
+ ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0
; CHECK-NEXT: $sgpr1 = V_READLANE_B32 $vgpr0, 1
; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1
; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
index 76f204dd0c16a..420f003d4f417 100644
--- a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
+++ b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
@@ -41,10 +41,10 @@ define void @main(i1 %arg) #0 {
; CHECK-NEXT: s_load_dwordx16 s[8:23], s[68:69], 0x130
; CHECK-NEXT: ; implicit-def: $vgpr7 : SGPR spill to VGPR lane
; CHECK-NEXT: v_writelane_b32 v6, s70, 20
+; CHECK-NEXT: v_mov_b32_e32 v2, 0
; CHECK-NEXT: v_writelane_b32 v6, s71, 21
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v1, s4
-; CHECK-NEXT: v_mov_b32_e32 v2, 0
; CHECK-NEXT: v_writelane_b32 v7, s8, 0
; CHECK-NEXT: v_writelane_b32 v7, s9, 1
; CHECK-NEXT: v_writelane_b32 v7, s10, 2
@@ -76,15 +76,14 @@ define void @main(i1 %arg) #0 {
; CHECK-NEXT: v_writelane_b32 v7, s64, 28
; CHECK-NEXT: v_writelane_b32 v7, s65, 29
; CHECK-NEXT: v_writelane_b32 v7, s66, 30
+; CHECK-NEXT: v_writelane_b32 v7, s67, 31
; CHECK-NEXT: s_load_dwordx16 s[8:23], s[68:69], 0x1f0
; CHECK-NEXT: s_load_dwordx16 s[36:51], s[68:69], 0x2f0
; CHECK-NEXT: s_mov_b32 s69, s68
; CHECK-NEXT: s_mov_b32 s70, s68
; CHECK-NEXT: s_mov_b32 s71, s68
-; CHECK-NEXT: v_writelane_b32 v7, s67, 31
-; CHECK-NEXT: image_sample_lz v1, v[1:2], s[60:67], s[68:71] dmask:0x1
-; CHECK-NEXT: v_readlane_b32 s52, v7, 0
; CHECK-NEXT: v_mov_b32_e32 v3, v2
+; CHECK-NEXT: v_readlane_b32 s52, v7, 0
; CHECK-NEXT: v_readlane_b32 s53, v7, 1
; CHECK-NEXT: v_readlane_b32 s54, v7, 2
; CHECK-NEXT: v_readlane_b32 s55, v7, 3
@@ -92,12 +91,13 @@ define void @main(i1 %arg) #0 {
; CHECK-NEXT: v_readlane_b32 s57, v7, 5
; CHECK-NEXT: v_readlane_b32 s58, v7, 6
; CHECK-NEXT: v_readlane_b32 s59, v7, 7
+; CHECK-NEXT: image_sample_lz v1, v[1:2], s[60:67], s[68:71] dmask:0x1
; CHECK-NEXT: v_and_b32_e32 v5, 1, v0
; CHECK-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v5
; CHECK-NEXT: v_readlane_b32 s60, v7, 8
; CHECK-NEXT: v_readlane_b32 s61, v7, 9
-; CHECK-NEXT: v_readlane_b32 s62, v7, 10
; CHECK-NEXT: image_sample_lz v4, v[2:3], s[52:59], s[68:71] dmask:0x1
+; CHECK-NEXT: v_readlane_b32 s62, v7, 10
; CHECK-NEXT: v_readlane_b32 s63, v7, 11
; CHECK-NEXT: v_readlane_b32 s64, v7, 12
; CHECK-NEXT: v_readlane_b32 s65, v7, 13
@@ -109,7 +109,6 @@ define void @main(i1 %arg) #0 {
; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
; CHECK-NEXT: s_cbranch_execz .LBB0_3
; CHECK-NEXT: ; %bb.1: ; %bb48
-; CHECK-NEXT: v_readlane_b32 s52, v7, 16
; CHECK-NEXT: v_readlane_b32 s60, v7, 24
; CHECK-NEXT: v_readlane_b32 s61, v7, 25
; CHECK-NEXT: v_readlane_b32 s62, v7, 26
@@ -120,10 +119,11 @@ define void @main(i1 %arg) #0 {
; CHECK-NEXT: v_readlane_b32 s67, v7, 31
; CHECK-NEXT: v_mov_b32_e32 v1, v2
; CHECK-NEXT: s_and_b64 vcc, exec, -1
+; CHECK-NEXT: v_readlane_b32 s52, v7, 16
; CHECK-NEXT: v_readlane_b32 s53, v7, 17
; CHECK-NEXT: v_readlane_b32 s54, v7, 18
-; CHECK-NEXT: v_readlane_b32 s55, v7, 19
; CHECK-NEXT: image_sample_lz v3, v[2:3], s[60:67], s[68:71] dmask:0x1
+; CHECK-NEXT: v_readlane_b32 s55, v7, 19
; CHECK-NEXT: v_readlane_b32 s56, v7, 20
; CHECK-NEXT: v_readlane_b32 s57, v7, 21
; CHECK-NEXT: v_readlane_b32 s58, v7, 22
@@ -152,10 +152,18 @@ define void @main(i1 %arg) #0 {
; CHECK-NEXT: s_mov_b32 s16, 0
; CHECK-NEXT: s_mov_b32 s17, s16
; CHECK-NEXT: v_mov_b32_e32 v0, s16
-; CHECK-NEXT: v_readlane_b32 s44, v7, 16
+; CHECK-NEXT: v_readlane_b32 s52, v7, 24
+; CHECK-NEXT: v_readlane_b32 s53, v7, 25
+; CHECK-NEXT: v_readlane_b32 s54, v7, 26
+; CHECK-NEXT: v_readlane_b32 s55, v7, 27
+; CHECK-NEXT: v_readlane_b32 s56, v7, 28
+; CHECK-NEXT: v_readlane_b32 s57, v7, 29
+; CHECK-NEXT: v_readlane_b32 s58, v7, 30
+; CHECK-NEXT: v_readlane_b32 s59, v7, 31
; CHECK-NEXT: v_mov_b32_e32 v1, s17
; CHECK-NEXT: s_mov_b32 s18, s16
; CHECK-NEXT: s_mov_b32 s19, s16
+; CHECK-NEXT: v_readlane_b32 s44, v7, 16
; CHECK-NEXT: v_readlane_b32 s45, v7, 17
; CHECK-NEXT: v_readlane_b32 s46, v7, 18
; CHECK-NEXT: v_readlane_b32 s47, v7, 19
@@ -163,16 +171,6 @@ define void @main(i1 %arg) #0 {
; CHECK-NEXT: v_readlane_b32 s49, v7, 21
; CHECK-NEXT: v_readlane_b32 s50, v7, 22
; CHECK-NEXT: v_readlane_b32 s51, v7, 23
-; CHECK-NEXT: v_readlane_b32 s52, v7, 24
-; CHECK-NEXT: v_readlane_b32 s53, v7, 25
-; CHECK-NEXT: v_readlane_b32 s54, v7, 26
-; CHECK-NEXT: v_readlane_b32 s55, v7, 27
-; CHECK-NEXT: v_readlane_b32 s56, v7, 28
-; CHECK-NEXT: v_readlane_b32 s57, v7, 29
-; CHECK-NEXT: v_readlane_b32 s58, v7, 30
-; CHECK-NEXT: v_readlane_b32 s59, v7, 31
-; CHECK-NEXT: image_sample_lz v2, v[0:1], s[44:51], s[16:19] dmask:0x1
-; CHECK-NEXT: v_readlane_b32 s44, v7, 0
; CHECK-NEXT: v_readlane_b32 s52, v7, 8
; CHECK-NEXT: v_readlane_b32 s53, v7, 9
; CHECK-NEXT: v_readlane_b32 s54, v7, 10
@@ -181,12 +179,14 @@ define void @main(i1 %arg) #0 {
; CHECK-NEXT: v_readlane_b32 s57, v7, 13
; CHECK-NEXT: v_readlane_b32 s58, v7, 14
; CHECK-NEXT: v_readlane_b32 s59, v7, 15
+; CHECK-NEXT: image_sample_lz v2, v[0:1], s[44:51], s[16:19] dmask:0x1
; CHECK-NEXT: v_mov_b32_e32 v3, 0
; CHECK-NEXT: v_mov_b32_e32 v4, v3
+; CHECK-NEXT: v_readlane_b32 s44, v7, 0
; CHECK-NEXT: v_readlane_b32 s45, v7, 1
+; CHECK-NEXT: image_sample_lz v0, v[0:1], s[52:59], s[24:27] dmask:0x1
; CHECK-NEXT: v_readlane_b32 s46, v7, 2
; CHECK-NEXT: v_readlane_b32 s47, v7, 3
-; CHECK-NEXT: image_sample_lz v0, v[0:1], s[52:59], s[24:27] dmask:0x1
; CHECK-NEXT: v_readlane_b32 s48, v7, 4
; CHECK-NEXT: v_readlane_b32 s49, v7, 5
; CHECK-NEXT: v_readlane_b32 s50, v7, 6
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
index e1b4cad370f96..4a89b2fcc017c 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
@@ -1826,10 +1826,10 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec,
; GCN-NEXT: s_or_b32 s0, s0, s5
; GCN-NEXT: s_and_b32 s0, s0, 0xffff
; GCN-NEXT: s_or_b32 s0, s0, s4
-; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
-; GCN-NEXT: v_readlane_b32 s0, v6, 0
; GCN-NEXT: v_readlane_b32 s1, v6, 1
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_readlane_b32 s0, v6, 0
; GCN-NEXT: v_mov_b32_e32 v5, s1
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: v_mov_b32_e32 v3, s3
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
index 83c240c17ff1c..9fdc72f054f90 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
@@ -10279,11 +10279,11 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: v_writelane_b32 v62, s3, 5
; GFX8-NEXT: v_readlane_b32 s2, v62, 2
; GFX8-NEXT: v_readlane_b32 s3, v62, 3
+; GFX8-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000
; GFX8-NEXT: v_mov_b32_e32 v35, s49
; GFX8-NEXT: s_bfe_i64 s[48:49], s[4:5], 0x10000
; GFX8-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x10000
; GFX8-NEXT: v_readlane_b32 s2, v62, 0
-; GFX8-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000
; GFX8-NEXT: v_readlane_b32 s3, v62, 1
; GFX8-NEXT: v_mov_b32_e32 v5, s75
; GFX8-NEXT: v_mov_b32_e32 v13, s73
@@ -10577,8 +10577,8 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v2, s34
; GFX8-NEXT: v_mov_b32_e32 v3, s35
; GFX8-NEXT: v_mov_b32_e32 v4, s2
-; GFX8-NEXT: v_readlane_b32 s2, v62, 4
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT: v_readlane_b32 s2, v62, 4
; GFX8-NEXT: v_readlane_b32 s3, v62, 5
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v0, s30
diff --git a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll
index 75638c5fa8476..58375b6f8a8a4 100644
--- a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll
@@ -139,13 +139,13 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
; GFX906-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX906-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX906-NEXT: s_mov_b64 exec, s[34:35]
-; GFX906-NEXT: v_readlane_b32 s16, v39, 22
; GFX906-NEXT: s_mov_b32 s12, s24
; GFX906-NEXT: s_mov_b32 s13, s23
; GFX906-NEXT: s_mov_b32 s14, s22
; GFX906-NEXT: v_mov_b32_e32 v31, v32
; GFX906-NEXT: s_mov_b32 s15, s21
; GFX906-NEXT: s_mov_b64 s[10:11], s[26:27]
+; GFX906-NEXT: v_readlane_b32 s16, v39, 22
; GFX906-NEXT: v_readlane_b32 s17, v39, 23
; GFX906-NEXT: v_mov_b32_e32 v40, v32
; GFX906-NEXT: s_swappc_b64 s[30:31], s[16:17]
@@ -232,20 +232,20 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
; GFX906-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX906-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX906-NEXT: s_mov_b64 exec, s[34:35]
-; GFX906-NEXT: v_readlane_b32 s4, v39, 10
-; GFX906-NEXT: v_readlane_b32 s6, v39, 8
-; GFX906-NEXT: v_readlane_b32 s8, v39, 6
-; GFX906-NEXT: v_readlane_b32 s10, v39, 4
-; GFX906-NEXT: v_readlane_b32 s16, v39, 22
; GFX906-NEXT: v_readlane_b32 s12, v39, 3
; GFX906-NEXT: v_mov_b32_e32 v31, v40
; GFX906-NEXT: v_readlane_b32 s13, v39, 2
; GFX906-NEXT: v_readlane_b32 s14, v39, 1
; GFX906-NEXT: v_readlane_b32 s15, v39, 0
+; GFX906-NEXT: v_readlane_b32 s4, v39, 10
; GFX906-NEXT: v_readlane_b32 s5, v39, 11
+; GFX906-NEXT: v_readlane_b32 s6, v39, 8
; GFX906-NEXT: v_readlane_b32 s7, v39, 9
+; GFX906-NEXT: v_readlane_b32 s8, v39, 6
; GFX906-NEXT: v_readlane_b32 s9, v39, 7
+; GFX906-NEXT: v_readlane_b32 s10, v39, 4
; GFX906-NEXT: v_readlane_b32 s11, v39, 5
+; GFX906-NEXT: v_readlane_b32 s16, v39, 22
; GFX906-NEXT: v_readlane_b32 s17, v39, 23
; GFX906-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX906-NEXT: s_or_saveexec_b64 s[34:35], -1
@@ -253,19 +253,19 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
; GFX906-NEXT: s_mov_b64 exec, s[34:35]
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: v_readlane_b32 s4, v39, 10
-; GFX906-NEXT: v_readlane_b32 s6, v39, 8
-; GFX906-NEXT: v_readlane_b32 s8, v39, 6
-; GFX906-NEXT: v_readlane_b32 s10, v39, 4
-; GFX906-NEXT: v_readlane_b32 s16, v39, 22
; GFX906-NEXT: v_readlane_b32 s5, v39, 11
+; GFX906-NEXT: v_readlane_b32 s6, v39, 8
; GFX906-NEXT: v_readlane_b32 s7, v39, 9
+; GFX906-NEXT: v_readlane_b32 s8, v39, 6
; GFX906-NEXT: v_readlane_b32 s9, v39, 7
+; GFX906-NEXT: v_readlane_b32 s10, v39, 4
; GFX906-NEXT: v_readlane_b32 s11, v39, 5
; GFX906-NEXT: v_readlane_b32 s12, v39, 3
; GFX906-NEXT: v_readlane_b32 s13, v39, 2
; GFX906-NEXT: v_readlane_b32 s14, v39, 1
; GFX906-NEXT: v_readlane_b32 s15, v39, 0
; GFX906-NEXT: v_mov_b32_e32 v31, v40
+; GFX906-NEXT: v_readlane_b32 s16, v39, 22
; GFX906-NEXT: v_readlane_b32 s17, v39, 23
; GFX906-NEXT: v_readlane_b32 s21, v39, 12
; GFX906-NEXT: ;;#ASMSTART
@@ -528,13 +528,13 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
; GFX908-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX908-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; GFX908-NEXT: s_mov_b64 exec, s[34:35]
-; GFX908-NEXT: v_readlane_b32 s16, v39, 22
; GFX908-NEXT: s_mov_b32 s12, s24
; GFX908-NEXT: s_mov_b32 s13, s23
; GFX908-NEXT: s_mov_b32 s14, s22
; GFX908-NEXT: v_mov_b32_e32 v31, v32
; GFX908-NEXT: s_mov_b32 s15, s21
; GFX908-NEXT: s_mov_b64 s[10:11], s[26:27]
+; GFX908-NEXT: v_readlane_b32 s16, v39, 22
; GFX908-NEXT: v_readlane_b32 s17, v39, 23
; GFX908-NEXT: v_mov_b32_e32 v40, v32
; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17]
@@ -621,20 +621,20 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
; GFX908-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX908-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; GFX908-NEXT: s_mov_b64 exec, s[34:35]
-; GFX908-NEXT: v_readlane_b32 s4, v39, 10
-; GFX908-NEXT: v_readlane_b32 s6, v39, 8
-; GFX908-NEXT: v_readlane_b32 s8, v39, 6
-; GFX908-NEXT: v_readlane_b32 s10, v39, 4
-; GFX908-NEXT: v_readlane_b32 s16, v39, 22
; GFX908-NEXT: v_readlane_b32 s12, v39, 3
; GFX908-NEXT: v_mov_b32_e32 v31, v40
; GFX908-NEXT: v_readlane_b32 s13, v39, 2
; GFX908-NEXT: v_readlane_b32 s14, v39, 1
; GFX908-NEXT: v_readlane_b32 s15, v39, 0
+; GFX908-NEXT: v_readlane_b32 s4, v39, 10
; GFX908-NEXT: v_readlane_b32 s5, v39, 11
+; GFX908-NEXT: v_readlane_b32 s6, v39, 8
; GFX908-NEXT: v_readlane_b32 s7, v39, 9
+; GFX908-NEXT: v_readlane_b32 s8, v39, 6
; GFX908-NEXT: v_readlane_b32 s9, v39, 7
+; GFX908-NEXT: v_readlane_b32 s10, v39, 4
; GFX908-NEXT: v_readlane_b32 s11, v39, 5
+; GFX908-NEXT: v_readlane_b32 s16, v39, 22
; GFX908-NEXT: v_readlane_b32 s17, v39, 23
; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX908-NEXT: s_or_saveexec_b64 s[34:35], -1
@@ -642,19 +642,19 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
; GFX908-NEXT: s_mov_b64 exec, s[34:35]
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_readlane_b32 s4, v39, 10
-; GFX908-NEXT: v_readlane_b32 s6, v39, 8
-; GFX908-NEXT: v_readlane_b32 s8, v39, 6
-; GFX908-NEXT: v_readlane_b32 s10, v39, 4
-; GFX908-NEXT: v_readlane_b32 s16, v39, 22
; GFX908-NEXT: v_readlane_b32 s5, v39, 11
+; GFX908-NEXT: v_readlane_b32 s6, v39, 8
; GFX908-NEXT: v_readlane_b32 s7, v39, 9
+; GFX908-NEXT: v_readlane_b32 s8, v39, 6
; GFX908-NEXT: v_readlane_b32 s9, v39, 7
+; GFX908-NEXT: v_readlane_b32 s10, v39, 4
; GFX908-NEXT: v_readlane_b32 s11, v39, 5
; GFX908-NEXT: v_readlane_b32 s12, v39, 3
; GFX908-NEXT: v_readlane_b32 s13, v39, 2
; GFX908-NEXT: v_readlane_b32 s14, v39, 1
; GFX908-NEXT: v_readlane_b32 s15, v39, 0
; GFX908-NEXT: v_mov_b32_e32 v31, v40
+; GFX908-NEXT: v_readlane_b32 s16, v39, 22
; GFX908-NEXT: v_readlane_b32 s17, v39, 23
; GFX908-NEXT: v_readlane_b32 s21, v39, 12
; GFX908-NEXT: ;;#ASMSTART
diff --git a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll
index 15f5f890d57b5..d1dee534414ac 100644
--- a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll
@@ -8,6 +8,7 @@
define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 {
; CHECK-LABEL: kernel0:
; CHECK: ; %bb.0:
+; CHECK-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
@@ -19,10 +20,9 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 {
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def s[2:3]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane
-; CHECK-NEXT: s_load_dword s0, s[8:9], 0x8
; CHECK-NEXT: v_writelane_b32 v22, s2, 0
; CHECK-NEXT: v_writelane_b32 v22, s3, 1
+; CHECK-NEXT: s_load_dword s0, s[8:9], 0x8
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def s[48:51]
; CHECK-NEXT: ;;#ASMEND
@@ -123,19 +123,19 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 {
; CHECK-NEXT: v_writelane_b32 v22, s0, 58
; CHECK-NEXT: v_writelane_b32 v22, s1, 59
; CHECK-NEXT: v_writelane_b32 v22, s2, 60
+; CHECK-NEXT: ; implicit-def: $vgpr23 : SGPR spill to VGPR lane
; CHECK-NEXT: v_writelane_b32 v22, s3, 61
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def s[0:7]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: ; implicit-def: $vgpr23 : SGPR spill to VGPR lane
-; CHECK-NEXT: v_writelane_b32 v22, s0, 62
; CHECK-NEXT: v_writelane_b32 v23, s2, 0
; CHECK-NEXT: v_writelane_b32 v23, s3, 1
; CHECK-NEXT: v_writelane_b32 v23, s4, 2
; CHECK-NEXT: v_writelane_b32 v23, s5, 3
; CHECK-NEXT: v_writelane_b32 v23, s6, 4
-; CHECK-NEXT: v_writelane_b32 v22, s1, 63
+; CHECK-NEXT: v_writelane_b32 v22, s0, 62
; CHECK-NEXT: v_writelane_b32 v23, s7, 5
+; CHECK-NEXT: v_writelane_b32 v22, s1, 63
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def s[0:15]
; CHECK-NEXT: ;;#ASMEND
@@ -208,6 +208,9 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 {
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: v_readlane_b32 s0, v22, 2
; CHECK-NEXT: v_readlane_b32 s1, v22, 3
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use s[48:51]
+; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: v_readlane_b32 s2, v22, 4
; CHECK-NEXT: v_readlane_b32 s3, v22, 5
; CHECK-NEXT: v_readlane_b32 s4, v22, 6
@@ -215,9 +218,6 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 {
; CHECK-NEXT: v_readlane_b32 s6, v22, 8
; CHECK-NEXT: v_readlane_b32 s7, v22, 9
; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; use s[48:51]
-; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:7]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: v_readlane_b32 s0, v22, 10
@@ -241,29 +241,23 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 {
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: v_readlane_b32 s0, v22, 26
; CHECK-NEXT: v_readlane_b32 s1, v22, 27
-; CHECK-NEXT: v_readlane_b32 s2, v22, 28
-; CHECK-NEXT: v_readlane_b32 s3, v22, 29
-; CHECK-NEXT: v_readlane_b32 s4, v22, 30
-; CHECK-NEXT: v_readlane_b32 s5, v22, 31
-; CHECK-NEXT: v_readlane_b32 s6, v22, 32
-; CHECK-NEXT: v_readlane_b32 s7, v22, 33
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[38:39]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[44:47]
; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_readlane_b32 s2, v22, 28
+; CHECK-NEXT: v_readlane_b32 s3, v22, 29
+; CHECK-NEXT: v_readlane_b32 s4, v22, 30
+; CHECK-NEXT: v_readlane_b32 s5, v22, 31
+; CHECK-NEXT: v_readlane_b32 s6, v22, 32
+; CHECK-NEXT: v_readlane_b32 s7, v22, 33
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:7]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: v_readlane_b32 s0, v22, 34
; CHECK-NEXT: v_readlane_b32 s1, v22, 35
-; CHECK-NEXT: v_readlane_b32 s2, v22, 36
-; CHECK-NEXT: v_readlane_b32 s3, v22, 37
-; CHECK-NEXT: v_readlane_b32 s4, v22, 38
-; CHECK-NEXT: v_readlane_b32 s5, v22, 39
-; CHECK-NEXT: v_readlane_b32 s6, v22, 40
-; CHECK-NEXT: v_readlane_b32 s7, v22, 41
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[16:31]
; CHECK-NEXT: ;;#ASMEND
@@ -273,6 +267,12 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 {
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[40:43]
; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_readlane_b32 s2, v22, 36
+; CHECK-NEXT: v_readlane_b32 s3, v22, 37
+; CHECK-NEXT: v_readlane_b32 s4, v22, 38
+; CHECK-NEXT: v_readlane_b32 s5, v22, 39
+; CHECK-NEXT: v_readlane_b32 s6, v22, 40
+; CHECK-NEXT: v_readlane_b32 s7, v22, 41
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:7]
; CHECK-NEXT: ;;#ASMEND
@@ -297,11 +297,11 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 {
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: v_readlane_b32 s0, v22, 58
; CHECK-NEXT: v_readlane_b32 s1, v22, 59
-; CHECK-NEXT: v_readlane_b32 s2, v22, 60
-; CHECK-NEXT: v_readlane_b32 s3, v22, 61
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[34:35]
; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_readlane_b32 s2, v22, 60
+; CHECK-NEXT: v_readlane_b32 s3, v22, 61
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:3]
; CHECK-NEXT: ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
index 8f8e2c0ba52fc..f196004e7660b 100644
--- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
+++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
@@ -276,10 +276,10 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: .LBB1_24: ; %Flow23
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS1-NEXT: s_load_dwordx4 s[4:7], s[38:39], 0x0
-; GLOBALNESS1-NEXT: v_readlane_b32 s70, v57, 8
-; GLOBALNESS1-NEXT: v_readlane_b32 s8, v57, 10
; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], 0, 0
+; GLOBALNESS1-NEXT: v_readlane_b32 s70, v57, 8
; GLOBALNESS1-NEXT: v_readlane_b32 s71, v57, 9
+; GLOBALNESS1-NEXT: v_readlane_b32 s8, v57, 10
; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0)
; GLOBALNESS1-NEXT: s_mov_b32 s55, s7
; GLOBALNESS1-NEXT: v_readlane_b32 s9, v57, 11
@@ -587,11 +587,11 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: s_branch .LBB1_14
; GLOBALNESS0-NEXT: .LBB1_24: ; %Flow23
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT: v_readlane_b32 s84, v57, 8
-; GLOBALNESS0-NEXT: v_readlane_b32 s8, v57, 10
; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], 0, 0
; GLOBALNESS0-NEXT: s_mov_b32 s55, s83
+; GLOBALNESS0-NEXT: v_readlane_b32 s84, v57, 8
; GLOBALNESS0-NEXT: v_readlane_b32 s85, v57, 9
+; GLOBALNESS0-NEXT: v_readlane_b32 s8, v57, 10
; GLOBALNESS0-NEXT: v_readlane_b32 s9, v57, 11
; GLOBALNESS0-NEXT: .LBB1_25: ; %Flow24
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
index e9a0671ead4e0..57ddcb20d613c 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
@@ -158,7 +158,6 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg)
; GFX9-O0-NEXT: v_writelane_b32 v5, s34, 2
; GFX9-O0-NEXT: v_writelane_b32 v5, s35, 3
; GFX9-O0-NEXT: s_mov_b32 s34, 0
-; GFX9-O0-NEXT: s_nop 2
; GFX9-O0-NEXT: buffer_load_dwordx2 v[3:4], off, s[36:39], s34
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
index 72672c8b6efad..6a3a58e3ab120 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
@@ -135,7 +135,6 @@ define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) {
; GFX9-O0-NEXT: v_writelane_b32 v5, s0, 3
; GFX9-O0-NEXT: v_writelane_b32 v5, s1, 4
; GFX9-O0-NEXT: s_mov_b32 s0, 0
-; GFX9-O0-NEXT: s_nop 2
; GFX9-O0-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], s0
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: buffer_store_dword v3, off, s[16:19], 0 offset:12 ; 4-byte Folded Spill
@@ -965,7 +964,6 @@ define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) {
; GFX9-O0-NEXT: v_writelane_b32 v5, s0, 3
; GFX9-O0-NEXT: v_writelane_b32 v5, s1, 4
; GFX9-O0-NEXT: s_mov_b32 s0, 0
-; GFX9-O0-NEXT: s_nop 2
; GFX9-O0-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], s0
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: buffer_store_dword v3, off, s[16:19], 0 offset:12 ; 4-byte Folded Spill
More information about the llvm-commits
mailing list