[llvm] AMDGPU/GlobalISel: Restore disabled test (PR #129001)

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Wed Feb 26 21:09:04 PST 2025


https://github.com/arsenm created https://github.com/llvm/llvm-project/pull/129001

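Re-enable this test by removing the `REQUIRES: do-not-run-me` line, and update
its stale CHECK lines to match (for example, the kernel argument loads now use
`s[4:5]` instead of `s[2:3]`, and the GFX11 checks no longer expect `s_nop 0` /
`s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)` before `s_endpgm`).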

From 3c3784d2693793fa8fbb9cabe37c6da5c4b9cb69 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Thu, 27 Feb 2025 12:08:17 +0700
Subject: [PATCH] AMDGPU/GlobalISel: Restore disabled test

---
 .../AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll | 887 +++++++++---------
 1 file changed, 421 insertions(+), 466 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
index 53f6c9543c3e3..074272f7bed86 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
@@ -5,7 +5,6 @@
 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck --check-prefix=GFX10_W64 %s
 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefix=GFX11_W32 %s
 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize64 < %s | FileCheck --check-prefix=GFX11_W64 %s
-; REQUIRES: do-not-run-me
 
 define float @v_div_fmas_f32(float %a, float %b, float %c, i1 %d) {
 ; GFX7-LABEL: v_div_fmas_f32:
@@ -291,14 +290,14 @@ define amdgpu_ps double @s_div_fmas_f64(double inreg %a, double inreg %b, double
 define amdgpu_kernel void @test_div_fmas_f32(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c, [8 x i32], i1 %d) {
 ; GFX7-LABEL: test_div_fmas_f32:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_load_dword s4, s[2:3], 0xa
-; GFX7-NEXT:    s_load_dword s5, s[2:3], 0x13
-; GFX7-NEXT:    s_load_dword s6, s[2:3], 0x1c
-; GFX7-NEXT:    s_load_dword s7, s[2:3], 0x25
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX7-NEXT:    s_load_dword s2, s[4:5], 0xa
+; GFX7-NEXT:    s_load_dword s3, s[4:5], 0x13
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x1c
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x25
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v0, s2
+; GFX7-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX7-NEXT:    s_and_b32 s2, 1, s7
 ; GFX7-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
@@ -311,17 +310,17 @@ define amdgpu_kernel void @test_div_fmas_f32(ptr addrspace(1) %out, [8 x i32], f
 ;
 ; GFX8-LABEL: test_div_fmas_f32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dword s0, s[2:3], 0x28
-; GFX8-NEXT:    s_load_dword s1, s[2:3], 0x4c
-; GFX8-NEXT:    s_load_dword s4, s[2:3], 0x70
-; GFX8-NEXT:    s_load_dword s5, s[2:3], 0x94
+; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x28
+; GFX8-NEXT:    s_load_dword s1, s[4:5], 0x4c
+; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x70
+; GFX8-NEXT:    s_load_dword s3, s[4:5], 0x94
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
-; GFX8-NEXT:    v_mov_b32_e32 v2, s4
-; GFX8-NEXT:    s_and_b32 s0, 1, s5
+; GFX8-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8-NEXT:    s_and_b32 s0, 1, s3
 ; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX8-NEXT:    s_nop 2
 ; GFX8-NEXT:    v_div_fmas_f32 v2, v0, v1, v2
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
@@ -333,14 +332,14 @@ define amdgpu_kernel void @test_div_fmas_f32(ptr addrspace(1) %out, [8 x i32], f
 ; GFX10_W32-LABEL: test_div_fmas_f32:
 ; GFX10_W32:       ; %bb.0:
 ; GFX10_W32-NEXT:    s_clause 0x4
-; GFX10_W32-NEXT:    s_load_dword s4, s[2:3], 0x94
-; GFX10_W32-NEXT:    s_load_dword s5, s[2:3], 0x4c
-; GFX10_W32-NEXT:    s_load_dword s6, s[2:3], 0x70
-; GFX10_W32-NEXT:    s_load_dword s7, s[2:3], 0x28
-; GFX10_W32-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX10_W32-NEXT:    s_load_dword s2, s[4:5], 0x94
+; GFX10_W32-NEXT:    s_load_dword s3, s[4:5], 0x4c
+; GFX10_W32-NEXT:    s_load_dword s6, s[4:5], 0x70
+; GFX10_W32-NEXT:    s_load_dword s7, s[4:5], 0x28
+; GFX10_W32-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10_W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10_W32-NEXT:    s_and_b32 s2, 1, s4
-; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10_W32-NEXT:    s_and_b32 s2, 1, s2
+; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s2
 ; GFX10_W32-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX10_W32-NEXT:    v_div_fmas_f32 v0, s7, v0, v1
@@ -351,14 +350,14 @@ define amdgpu_kernel void @test_div_fmas_f32(ptr addrspace(1) %out, [8 x i32], f
 ; GFX10_W64-LABEL: test_div_fmas_f32:
 ; GFX10_W64:       ; %bb.0:
 ; GFX10_W64-NEXT:    s_clause 0x4
-; GFX10_W64-NEXT:    s_load_dword s4, s[2:3], 0x94
-; GFX10_W64-NEXT:    s_load_dword s5, s[2:3], 0x4c
-; GFX10_W64-NEXT:    s_load_dword s6, s[2:3], 0x70
-; GFX10_W64-NEXT:    s_load_dword s7, s[2:3], 0x28
-; GFX10_W64-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX10_W64-NEXT:    s_load_dword s2, s[4:5], 0x94
+; GFX10_W64-NEXT:    s_load_dword s3, s[4:5], 0x4c
+; GFX10_W64-NEXT:    s_load_dword s6, s[4:5], 0x70
+; GFX10_W64-NEXT:    s_load_dword s7, s[4:5], 0x28
+; GFX10_W64-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10_W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10_W64-NEXT:    s_and_b32 s2, 1, s4
-; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10_W64-NEXT:    s_and_b32 s2, 1, s2
+; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
 ; GFX10_W64-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX10_W64-NEXT:    v_div_fmas_f32 v0, s7, v0, v1
@@ -369,40 +368,36 @@ define amdgpu_kernel void @test_div_fmas_f32(ptr addrspace(1) %out, [8 x i32], f
 ; GFX11_W32-LABEL: test_div_fmas_f32:
 ; GFX11_W32:       ; %bb.0:
 ; GFX11_W32-NEXT:    s_clause 0x4
-; GFX11_W32-NEXT:    s_load_b32 s4, s[2:3], 0x94
-; GFX11_W32-NEXT:    s_load_b32 s5, s[2:3], 0x4c
-; GFX11_W32-NEXT:    s_load_b32 s6, s[2:3], 0x70
-; GFX11_W32-NEXT:    s_load_b32 s7, s[2:3], 0x28
-; GFX11_W32-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11_W32-NEXT:    s_load_b32 s2, s[4:5], 0x94
+; GFX11_W32-NEXT:    s_load_b32 s3, s[4:5], 0x4c
+; GFX11_W32-NEXT:    s_load_b32 s6, s[4:5], 0x70
+; GFX11_W32-NEXT:    s_load_b32 s7, s[4:5], 0x28
+; GFX11_W32-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX11_W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11_W32-NEXT:    s_and_b32 s2, 1, s4
-; GFX11_W32-NEXT:    v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
+; GFX11_W32-NEXT:    s_and_b32 s2, 1, s2
+; GFX11_W32-NEXT:    v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s6
 ; GFX11_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s2
 ; GFX11_W32-NEXT:    v_div_fmas_f32 v0, s7, v0, v1
 ; GFX11_W32-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11_W32-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11_W32-NEXT:    s_nop 0
-; GFX11_W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11_W32-NEXT:    s_endpgm
 ;
 ; GFX11_W64-LABEL: test_div_fmas_f32:
 ; GFX11_W64:       ; %bb.0:
 ; GFX11_W64-NEXT:    s_clause 0x4
-; GFX11_W64-NEXT:    s_load_b32 s4, s[2:3], 0x94
-; GFX11_W64-NEXT:    s_load_b32 s5, s[2:3], 0x4c
-; GFX11_W64-NEXT:    s_load_b32 s6, s[2:3], 0x70
-; GFX11_W64-NEXT:    s_load_b32 s7, s[2:3], 0x28
-; GFX11_W64-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11_W64-NEXT:    s_load_b32 s2, s[4:5], 0x94
+; GFX11_W64-NEXT:    s_load_b32 s3, s[4:5], 0x4c
+; GFX11_W64-NEXT:    s_load_b32 s6, s[4:5], 0x70
+; GFX11_W64-NEXT:    s_load_b32 s7, s[4:5], 0x28
+; GFX11_W64-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX11_W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11_W64-NEXT:    s_and_b32 s2, 1, s4
-; GFX11_W64-NEXT:    v_mov_b32_e32 v0, s5
+; GFX11_W64-NEXT:    s_and_b32 s2, 1, s2
+; GFX11_W64-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX11_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
 ; GFX11_W64-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX11_W64-NEXT:    v_div_fmas_f32 v0, s7, v0, v1
 ; GFX11_W64-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11_W64-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11_W64-NEXT:    s_nop 0
-; GFX11_W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11_W64-NEXT:    s_endpgm
   %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %d)
   store float %result, ptr addrspace(1) %out, align 4
@@ -412,33 +407,33 @@ define amdgpu_kernel void @test_div_fmas_f32(ptr addrspace(1) %out, [8 x i32], f
 define amdgpu_kernel void @test_div_fmas_f32_inline_imm_0(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c, [8 x i32], i1 %d) {
 ; GFX7-LABEL: test_div_fmas_f32_inline_imm_0:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_load_dword s4, s[2:3], 0x13
-; GFX7-NEXT:    s_load_dword s5, s[2:3], 0x1c
-; GFX7-NEXT:    s_load_dword s6, s[2:3], 0x25
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x13
+; GFX7-NEXT:    s_load_dword s3, s[4:5], 0x1c
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x25
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v0, s2
+; GFX7-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX7-NEXT:    s_and_b32 s2, 1, s6
 ; GFX7-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
 ; GFX7-NEXT:    s_mov_b32 s2, -1
-; GFX7-NEXT:    s_nop 2
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_nop 1
 ; GFX7-NEXT:    v_div_fmas_f32 v0, 1.0, v0, v1
 ; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: test_div_fmas_f32_inline_imm_0:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dword s0, s[2:3], 0x4c
-; GFX8-NEXT:    s_load_dword s1, s[2:3], 0x70
-; GFX8-NEXT:    s_load_dword s4, s[2:3], 0x94
+; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x4c
+; GFX8-NEXT:    s_load_dword s1, s[4:5], 0x70
+; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x94
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
-; GFX8-NEXT:    s_and_b32 s0, 1, s4
+; GFX8-NEXT:    s_and_b32 s0, 1, s2
 ; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX8-NEXT:    s_nop 2
 ; GFX8-NEXT:    v_div_fmas_f32 v2, 1.0, v0, v1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
@@ -450,14 +445,14 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_0(ptr addrspace(1) %out,
 ; GFX10_W32-LABEL: test_div_fmas_f32_inline_imm_0:
 ; GFX10_W32:       ; %bb.0:
 ; GFX10_W32-NEXT:    s_clause 0x3
-; GFX10_W32-NEXT:    s_load_dword s4, s[2:3], 0x94
-; GFX10_W32-NEXT:    s_load_dword s5, s[2:3], 0x70
-; GFX10_W32-NEXT:    s_load_dword s6, s[2:3], 0x4c
-; GFX10_W32-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX10_W32-NEXT:    s_load_dword s2, s[4:5], 0x94
+; GFX10_W32-NEXT:    s_load_dword s3, s[4:5], 0x70
+; GFX10_W32-NEXT:    s_load_dword s6, s[4:5], 0x4c
+; GFX10_W32-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10_W32-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10_W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10_W32-NEXT:    s_and_b32 s2, 1, s4
-; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10_W32-NEXT:    s_and_b32 s2, 1, s2
+; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s2
 ; GFX10_W32-NEXT:    v_div_fmas_f32 v0, 1.0, s6, v0
 ; GFX10_W32-NEXT:    global_store_dword v1, v0, s[0:1]
@@ -466,14 +461,14 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_0(ptr addrspace(1) %out,
 ; GFX10_W64-LABEL: test_div_fmas_f32_inline_imm_0:
 ; GFX10_W64:       ; %bb.0:
 ; GFX10_W64-NEXT:    s_clause 0x3
-; GFX10_W64-NEXT:    s_load_dword s4, s[2:3], 0x94
-; GFX10_W64-NEXT:    s_load_dword s5, s[2:3], 0x70
-; GFX10_W64-NEXT:    s_load_dword s6, s[2:3], 0x4c
-; GFX10_W64-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX10_W64-NEXT:    s_load_dword s2, s[4:5], 0x94
+; GFX10_W64-NEXT:    s_load_dword s3, s[4:5], 0x70
+; GFX10_W64-NEXT:    s_load_dword s6, s[4:5], 0x4c
+; GFX10_W64-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10_W64-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10_W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10_W64-NEXT:    s_and_b32 s2, 1, s4
-; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10_W64-NEXT:    s_and_b32 s2, 1, s2
+; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
 ; GFX10_W64-NEXT:    v_div_fmas_f32 v0, 1.0, s6, v0
 ; GFX10_W64-NEXT:    global_store_dword v1, v0, s[0:1]
@@ -482,37 +477,33 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_0(ptr addrspace(1) %out,
 ; GFX11_W32-LABEL: test_div_fmas_f32_inline_imm_0:
 ; GFX11_W32:       ; %bb.0:
 ; GFX11_W32-NEXT:    s_clause 0x3
-; GFX11_W32-NEXT:    s_load_b32 s4, s[2:3], 0x94
-; GFX11_W32-NEXT:    s_load_b32 s5, s[2:3], 0x70
-; GFX11_W32-NEXT:    s_load_b32 s6, s[2:3], 0x4c
-; GFX11_W32-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11_W32-NEXT:    s_load_b32 s2, s[4:5], 0x94
+; GFX11_W32-NEXT:    s_load_b32 s3, s[4:5], 0x70
+; GFX11_W32-NEXT:    s_load_b32 s6, s[4:5], 0x4c
+; GFX11_W32-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX11_W32-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11_W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11_W32-NEXT:    s_and_b32 s2, 1, s4
-; GFX11_W32-NEXT:    v_mov_b32_e32 v0, s5
+; GFX11_W32-NEXT:    s_and_b32 s2, 1, s2
+; GFX11_W32-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX11_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s2
 ; GFX11_W32-NEXT:    v_div_fmas_f32 v0, 1.0, s6, v0
 ; GFX11_W32-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11_W32-NEXT:    s_nop 0
-; GFX11_W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11_W32-NEXT:    s_endpgm
 ;
 ; GFX11_W64-LABEL: test_div_fmas_f32_inline_imm_0:
 ; GFX11_W64:       ; %bb.0:
 ; GFX11_W64-NEXT:    s_clause 0x3
-; GFX11_W64-NEXT:    s_load_b32 s4, s[2:3], 0x94
-; GFX11_W64-NEXT:    s_load_b32 s5, s[2:3], 0x70
-; GFX11_W64-NEXT:    s_load_b32 s6, s[2:3], 0x4c
-; GFX11_W64-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11_W64-NEXT:    s_load_b32 s2, s[4:5], 0x94
+; GFX11_W64-NEXT:    s_load_b32 s3, s[4:5], 0x70
+; GFX11_W64-NEXT:    s_load_b32 s6, s[4:5], 0x4c
+; GFX11_W64-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX11_W64-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11_W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11_W64-NEXT:    s_and_b32 s2, 1, s4
-; GFX11_W64-NEXT:    v_mov_b32_e32 v0, s5
+; GFX11_W64-NEXT:    s_and_b32 s2, 1, s2
+; GFX11_W64-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX11_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
 ; GFX11_W64-NEXT:    v_div_fmas_f32 v0, 1.0, s6, v0
 ; GFX11_W64-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11_W64-NEXT:    s_nop 0
-; GFX11_W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11_W64-NEXT:    s_endpgm
   %result = call float @llvm.amdgcn.div.fmas.f32(float 1.0, float %b, float %c, i1 %d)
   store float %result, ptr addrspace(1) %out, align 4
@@ -522,33 +513,33 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_0(ptr addrspace(1) %out,
 define amdgpu_kernel void @test_div_fmas_f32_inline_imm_1(ptr addrspace(1) %out, float %a, float %b, float %c, [8 x i32], i1 %d) {
 ; GFX7-LABEL: test_div_fmas_f32_inline_imm_1:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_load_dword s4, s[2:3], 0x2
-; GFX7-NEXT:    s_load_dword s5, s[2:3], 0x4
-; GFX7-NEXT:    s_load_dword s6, s[2:3], 0xd
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s3, s[4:5], 0x4
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0xd
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v0, s2
+; GFX7-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX7-NEXT:    s_and_b32 s2, 1, s6
 ; GFX7-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
 ; GFX7-NEXT:    s_mov_b32 s2, -1
-; GFX7-NEXT:    s_nop 2
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_nop 1
 ; GFX7-NEXT:    v_div_fmas_f32 v0, v0, 1.0, v1
 ; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: test_div_fmas_f32_inline_imm_1:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dword s0, s[2:3], 0x8
-; GFX8-NEXT:    s_load_dword s1, s[2:3], 0x10
-; GFX8-NEXT:    s_load_dword s4, s[2:3], 0x34
+; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x8
+; GFX8-NEXT:    s_load_dword s1, s[4:5], 0x10
+; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x34
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
-; GFX8-NEXT:    s_and_b32 s0, 1, s4
+; GFX8-NEXT:    s_and_b32 s0, 1, s2
 ; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX8-NEXT:    s_nop 2
 ; GFX8-NEXT:    v_div_fmas_f32 v2, v0, 1.0, v1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
@@ -560,14 +551,14 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_1(ptr addrspace(1) %out,
 ; GFX10_W32-LABEL: test_div_fmas_f32_inline_imm_1:
 ; GFX10_W32:       ; %bb.0:
 ; GFX10_W32-NEXT:    s_clause 0x3
-; GFX10_W32-NEXT:    s_load_dword s4, s[2:3], 0x34
-; GFX10_W32-NEXT:    s_load_dword s5, s[2:3], 0x10
-; GFX10_W32-NEXT:    s_load_dword s6, s[2:3], 0x8
-; GFX10_W32-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX10_W32-NEXT:    s_load_dword s2, s[4:5], 0x34
+; GFX10_W32-NEXT:    s_load_dword s3, s[4:5], 0x10
+; GFX10_W32-NEXT:    s_load_dword s6, s[4:5], 0x8
+; GFX10_W32-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10_W32-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10_W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10_W32-NEXT:    s_and_b32 s2, 1, s4
-; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10_W32-NEXT:    s_and_b32 s2, 1, s2
+; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s2
 ; GFX10_W32-NEXT:    v_div_fmas_f32 v0, s6, 1.0, v0
 ; GFX10_W32-NEXT:    global_store_dword v1, v0, s[0:1]
@@ -576,14 +567,14 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_1(ptr addrspace(1) %out,
 ; GFX10_W64-LABEL: test_div_fmas_f32_inline_imm_1:
 ; GFX10_W64:       ; %bb.0:
 ; GFX10_W64-NEXT:    s_clause 0x3
-; GFX10_W64-NEXT:    s_load_dword s4, s[2:3], 0x34
-; GFX10_W64-NEXT:    s_load_dword s5, s[2:3], 0x10
-; GFX10_W64-NEXT:    s_load_dword s6, s[2:3], 0x8
-; GFX10_W64-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX10_W64-NEXT:    s_load_dword s2, s[4:5], 0x34
+; GFX10_W64-NEXT:    s_load_dword s3, s[4:5], 0x10
+; GFX10_W64-NEXT:    s_load_dword s6, s[4:5], 0x8
+; GFX10_W64-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10_W64-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10_W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10_W64-NEXT:    s_and_b32 s2, 1, s4
-; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10_W64-NEXT:    s_and_b32 s2, 1, s2
+; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
 ; GFX10_W64-NEXT:    v_div_fmas_f32 v0, s6, 1.0, v0
 ; GFX10_W64-NEXT:    global_store_dword v1, v0, s[0:1]
@@ -592,37 +583,33 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_1(ptr addrspace(1) %out,
 ; GFX11_W32-LABEL: test_div_fmas_f32_inline_imm_1:
 ; GFX11_W32:       ; %bb.0:
 ; GFX11_W32-NEXT:    s_clause 0x3
-; GFX11_W32-NEXT:    s_load_b32 s4, s[2:3], 0x34
-; GFX11_W32-NEXT:    s_load_b32 s5, s[2:3], 0x10
-; GFX11_W32-NEXT:    s_load_b32 s6, s[2:3], 0x8
-; GFX11_W32-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11_W32-NEXT:    s_load_b32 s2, s[4:5], 0x34
+; GFX11_W32-NEXT:    s_load_b32 s3, s[4:5], 0x10
+; GFX11_W32-NEXT:    s_load_b32 s6, s[4:5], 0x8
+; GFX11_W32-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX11_W32-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11_W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11_W32-NEXT:    s_and_b32 s2, 1, s4
-; GFX11_W32-NEXT:    v_mov_b32_e32 v0, s5
+; GFX11_W32-NEXT:    s_and_b32 s2, 1, s2
+; GFX11_W32-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX11_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s2
 ; GFX11_W32-NEXT:    v_div_fmas_f32 v0, s6, 1.0, v0
 ; GFX11_W32-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11_W32-NEXT:    s_nop 0
-; GFX11_W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11_W32-NEXT:    s_endpgm
 ;
 ; GFX11_W64-LABEL: test_div_fmas_f32_inline_imm_1:
 ; GFX11_W64:       ; %bb.0:
 ; GFX11_W64-NEXT:    s_clause 0x3
-; GFX11_W64-NEXT:    s_load_b32 s4, s[2:3], 0x34
-; GFX11_W64-NEXT:    s_load_b32 s5, s[2:3], 0x10
-; GFX11_W64-NEXT:    s_load_b32 s6, s[2:3], 0x8
-; GFX11_W64-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11_W64-NEXT:    s_load_b32 s2, s[4:5], 0x34
+; GFX11_W64-NEXT:    s_load_b32 s3, s[4:5], 0x10
+; GFX11_W64-NEXT:    s_load_b32 s6, s[4:5], 0x8
+; GFX11_W64-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX11_W64-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11_W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11_W64-NEXT:    s_and_b32 s2, 1, s4
-; GFX11_W64-NEXT:    v_mov_b32_e32 v0, s5
+; GFX11_W64-NEXT:    s_and_b32 s2, 1, s2
+; GFX11_W64-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX11_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
 ; GFX11_W64-NEXT:    v_div_fmas_f32 v0, s6, 1.0, v0
 ; GFX11_W64-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11_W64-NEXT:    s_nop 0
-; GFX11_W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11_W64-NEXT:    s_endpgm
   %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float 1.0, float %c, i1 %d)
   store float %result, ptr addrspace(1) %out, align 4
@@ -632,33 +619,33 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_1(ptr addrspace(1) %out,
 define amdgpu_kernel void @test_div_fmas_f32_inline_imm_2(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c, [8 x i32], i1 %d) {
 ; GFX7-LABEL: test_div_fmas_f32_inline_imm_2:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_load_dword s4, s[2:3], 0xa
-; GFX7-NEXT:    s_load_dword s5, s[2:3], 0x13
-; GFX7-NEXT:    s_load_dword s6, s[2:3], 0x25
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_load_dword s2, s[4:5], 0xa
+; GFX7-NEXT:    s_load_dword s3, s[4:5], 0x13
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x25
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v0, s2
+; GFX7-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX7-NEXT:    s_and_b32 s2, 1, s6
 ; GFX7-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
 ; GFX7-NEXT:    s_mov_b32 s2, -1
-; GFX7-NEXT:    s_nop 2
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_nop 1
 ; GFX7-NEXT:    v_div_fmas_f32 v0, v0, v1, 1.0
 ; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: test_div_fmas_f32_inline_imm_2:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dword s0, s[2:3], 0x28
-; GFX8-NEXT:    s_load_dword s1, s[2:3], 0x4c
-; GFX8-NEXT:    s_load_dword s4, s[2:3], 0x94
+; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x28
+; GFX8-NEXT:    s_load_dword s1, s[4:5], 0x4c
+; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x94
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
-; GFX8-NEXT:    s_and_b32 s0, 1, s4
+; GFX8-NEXT:    s_and_b32 s0, 1, s2
 ; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX8-NEXT:    s_nop 2
 ; GFX8-NEXT:    v_div_fmas_f32 v2, v0, v1, 1.0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
@@ -670,14 +657,14 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_2(ptr addrspace(1) %out,
 ; GFX10_W32-LABEL: test_div_fmas_f32_inline_imm_2:
 ; GFX10_W32:       ; %bb.0:
 ; GFX10_W32-NEXT:    s_clause 0x3
-; GFX10_W32-NEXT:    s_load_dword s4, s[2:3], 0x94
-; GFX10_W32-NEXT:    s_load_dword s5, s[2:3], 0x4c
-; GFX10_W32-NEXT:    s_load_dword s6, s[2:3], 0x28
-; GFX10_W32-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX10_W32-NEXT:    s_load_dword s2, s[4:5], 0x94
+; GFX10_W32-NEXT:    s_load_dword s3, s[4:5], 0x4c
+; GFX10_W32-NEXT:    s_load_dword s6, s[4:5], 0x28
+; GFX10_W32-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10_W32-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10_W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10_W32-NEXT:    s_and_b32 s2, 1, s4
-; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10_W32-NEXT:    s_and_b32 s2, 1, s2
+; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s2
 ; GFX10_W32-NEXT:    v_div_fmas_f32 v0, s6, v0, 1.0
 ; GFX10_W32-NEXT:    global_store_dword v1, v0, s[0:1]
@@ -686,14 +673,14 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_2(ptr addrspace(1) %out,
 ; GFX10_W64-LABEL: test_div_fmas_f32_inline_imm_2:
 ; GFX10_W64:       ; %bb.0:
 ; GFX10_W64-NEXT:    s_clause 0x3
-; GFX10_W64-NEXT:    s_load_dword s4, s[2:3], 0x94
-; GFX10_W64-NEXT:    s_load_dword s5, s[2:3], 0x4c
-; GFX10_W64-NEXT:    s_load_dword s6, s[2:3], 0x28
-; GFX10_W64-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX10_W64-NEXT:    s_load_dword s2, s[4:5], 0x94
+; GFX10_W64-NEXT:    s_load_dword s3, s[4:5], 0x4c
+; GFX10_W64-NEXT:    s_load_dword s6, s[4:5], 0x28
+; GFX10_W64-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10_W64-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10_W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10_W64-NEXT:    s_and_b32 s2, 1, s4
-; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10_W64-NEXT:    s_and_b32 s2, 1, s2
+; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
 ; GFX10_W64-NEXT:    v_div_fmas_f32 v0, s6, v0, 1.0
 ; GFX10_W64-NEXT:    global_store_dword v1, v0, s[0:1]
@@ -702,37 +689,33 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_2(ptr addrspace(1) %out,
 ; GFX11_W32-LABEL: test_div_fmas_f32_inline_imm_2:
 ; GFX11_W32:       ; %bb.0:
 ; GFX11_W32-NEXT:    s_clause 0x3
-; GFX11_W32-NEXT:    s_load_b32 s4, s[2:3], 0x94
-; GFX11_W32-NEXT:    s_load_b32 s5, s[2:3], 0x4c
-; GFX11_W32-NEXT:    s_load_b32 s6, s[2:3], 0x28
-; GFX11_W32-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11_W32-NEXT:    s_load_b32 s2, s[4:5], 0x94
+; GFX11_W32-NEXT:    s_load_b32 s3, s[4:5], 0x4c
+; GFX11_W32-NEXT:    s_load_b32 s6, s[4:5], 0x28
+; GFX11_W32-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX11_W32-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11_W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11_W32-NEXT:    s_and_b32 s2, 1, s4
-; GFX11_W32-NEXT:    v_mov_b32_e32 v0, s5
+; GFX11_W32-NEXT:    s_and_b32 s2, 1, s2
+; GFX11_W32-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX11_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s2
 ; GFX11_W32-NEXT:    v_div_fmas_f32 v0, s6, v0, 1.0
 ; GFX11_W32-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11_W32-NEXT:    s_nop 0
-; GFX11_W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11_W32-NEXT:    s_endpgm
 ;
 ; GFX11_W64-LABEL: test_div_fmas_f32_inline_imm_2:
 ; GFX11_W64:       ; %bb.0:
 ; GFX11_W64-NEXT:    s_clause 0x3
-; GFX11_W64-NEXT:    s_load_b32 s4, s[2:3], 0x94
-; GFX11_W64-NEXT:    s_load_b32 s5, s[2:3], 0x4c
-; GFX11_W64-NEXT:    s_load_b32 s6, s[2:3], 0x28
-; GFX11_W64-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11_W64-NEXT:    s_load_b32 s2, s[4:5], 0x94
+; GFX11_W64-NEXT:    s_load_b32 s3, s[4:5], 0x4c
+; GFX11_W64-NEXT:    s_load_b32 s6, s[4:5], 0x28
+; GFX11_W64-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX11_W64-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11_W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11_W64-NEXT:    s_and_b32 s2, 1, s4
-; GFX11_W64-NEXT:    v_mov_b32_e32 v0, s5
+; GFX11_W64-NEXT:    s_and_b32 s2, 1, s2
+; GFX11_W64-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX11_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
 ; GFX11_W64-NEXT:    v_div_fmas_f32 v0, s6, v0, 1.0
 ; GFX11_W64-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11_W64-NEXT:    s_nop 0
-; GFX11_W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11_W64-NEXT:    s_endpgm
   %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float 1.0, i1 %d)
   store float %result, ptr addrspace(1) %out, align 4
@@ -742,83 +725,83 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_2(ptr addrspace(1) %out,
 define amdgpu_kernel void @test_div_fmas_f64(ptr addrspace(1) %out, double %a, double %b, double %c, i1 %d) {
 ; GFX7-LABEL: test_div_fmas_f64:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x0
-; GFX7-NEXT:    s_load_dword s0, s[2:3], 0x8
+; GFX7-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s0, s[4:5], 0x8
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s6
-; GFX7-NEXT:    v_mov_b32_e32 v2, s8
-; GFX7-NEXT:    v_mov_b32_e32 v4, s10
+; GFX7-NEXT:    v_mov_b32_e32 v0, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s12
+; GFX7-NEXT:    v_mov_b32_e32 v4, s14
 ; GFX7-NEXT:    s_and_b32 s0, 1, s0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s7
-; GFX7-NEXT:    v_mov_b32_e32 v3, s9
-; GFX7-NEXT:    v_mov_b32_e32 v5, s11
+; GFX7-NEXT:    v_mov_b32_e32 v1, s11
+; GFX7-NEXT:    v_mov_b32_e32 v3, s13
+; GFX7-NEXT:    v_mov_b32_e32 v5, s15
 ; GFX7-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX7-NEXT:    s_mov_b32 s6, -1
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s10, -1
+; GFX7-NEXT:    s_mov_b32 s11, 0xf000
 ; GFX7-NEXT:    s_nop 1
 ; GFX7-NEXT:    v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
-; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: test_div_fmas_f64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x0
-; GFX8-NEXT:    s_load_dword s0, s[2:3], 0x20
+; GFX8-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x0
+; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x20
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v0, s6
-; GFX8-NEXT:    v_mov_b32_e32 v2, s8
-; GFX8-NEXT:    v_mov_b32_e32 v4, s10
+; GFX8-NEXT:    v_mov_b32_e32 v0, s10
+; GFX8-NEXT:    v_mov_b32_e32 v2, s12
+; GFX8-NEXT:    v_mov_b32_e32 v4, s14
 ; GFX8-NEXT:    s_and_b32 s0, 1, s0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s7
-; GFX8-NEXT:    v_mov_b32_e32 v3, s9
-; GFX8-NEXT:    v_mov_b32_e32 v5, s11
+; GFX8-NEXT:    v_mov_b32_e32 v1, s11
+; GFX8-NEXT:    v_mov_b32_e32 v3, s13
+; GFX8-NEXT:    v_mov_b32_e32 v5, s15
 ; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
 ; GFX8-NEXT:    s_nop 3
 ; GFX8-NEXT:    v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
-; GFX8-NEXT:    v_mov_b32_e32 v2, s4
-; GFX8-NEXT:    v_mov_b32_e32 v3, s5
+; GFX8-NEXT:    v_mov_b32_e32 v2, s8
+; GFX8-NEXT:    v_mov_b32_e32 v3, s9
 ; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX10_W32-LABEL: test_div_fmas_f64:
 ; GFX10_W32:       ; %bb.0:
 ; GFX10_W32-NEXT:    s_clause 0x1
-; GFX10_W32-NEXT:    s_load_dword s0, s[2:3], 0x20
-; GFX10_W32-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x0
+; GFX10_W32-NEXT:    s_load_dword s0, s[4:5], 0x20
+; GFX10_W32-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x0
 ; GFX10_W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10_W32-NEXT:    s_and_b32 s0, 1, s0
-; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s8
-; GFX10_W32-NEXT:    v_mov_b32_e32 v2, s10
+; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s12
+; GFX10_W32-NEXT:    v_mov_b32_e32 v2, s14
 ; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
-; GFX10_W32-NEXT:    v_mov_b32_e32 v1, s9
-; GFX10_W32-NEXT:    v_mov_b32_e32 v3, s11
-; GFX10_W32-NEXT:    v_div_fmas_f64 v[0:1], s[6:7], v[0:1], v[2:3]
+; GFX10_W32-NEXT:    v_mov_b32_e32 v1, s13
+; GFX10_W32-NEXT:    v_mov_b32_e32 v3, s15
+; GFX10_W32-NEXT:    v_div_fmas_f64 v[0:1], s[10:11], v[0:1], v[2:3]
 ; GFX10_W32-NEXT:    v_mov_b32_e32 v2, 0
-; GFX10_W32-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10_W32-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
 ; GFX10_W32-NEXT:    s_endpgm
 ;
 ; GFX10_W64-LABEL: test_div_fmas_f64:
 ; GFX10_W64:       ; %bb.0:
 ; GFX10_W64-NEXT:    s_clause 0x1
-; GFX10_W64-NEXT:    s_load_dword s0, s[2:3], 0x20
-; GFX10_W64-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x0
+; GFX10_W64-NEXT:    s_load_dword s0, s[4:5], 0x20
+; GFX10_W64-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x0
 ; GFX10_W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10_W64-NEXT:    s_and_b32 s0, 1, s0
-; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s8
-; GFX10_W64-NEXT:    v_mov_b32_e32 v2, s10
+; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s12
+; GFX10_W64-NEXT:    v_mov_b32_e32 v2, s14
 ; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX10_W64-NEXT:    v_mov_b32_e32 v1, s9
-; GFX10_W64-NEXT:    v_mov_b32_e32 v3, s11
-; GFX10_W64-NEXT:    v_div_fmas_f64 v[0:1], s[6:7], v[0:1], v[2:3]
+; GFX10_W64-NEXT:    v_mov_b32_e32 v1, s13
+; GFX10_W64-NEXT:    v_mov_b32_e32 v3, s15
+; GFX10_W64-NEXT:    v_div_fmas_f64 v[0:1], s[10:11], v[0:1], v[2:3]
 ; GFX10_W64-NEXT:    v_mov_b32_e32 v2, 0
-; GFX10_W64-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10_W64-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
 ; GFX10_W64-NEXT:    s_endpgm
 ;
 ; GFX11_W32-LABEL: test_div_fmas_f64:
 ; GFX11_W32:       ; %bb.0:
 ; GFX11_W32-NEXT:    s_clause 0x1
-; GFX11_W32-NEXT:    s_load_b32 s8, s[2:3], 0x20
-; GFX11_W32-NEXT:    s_load_b256 s[0:7], s[2:3], 0x0
+; GFX11_W32-NEXT:    s_load_b32 s8, s[4:5], 0x20
+; GFX11_W32-NEXT:    s_load_b256 s[0:7], s[4:5], 0x0
 ; GFX11_W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11_W32-NEXT:    s_and_b32 s8, 1, s8
 ; GFX11_W32-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
@@ -827,15 +810,13 @@ define amdgpu_kernel void @test_div_fmas_f64(ptr addrspace(1) %out, double %a, d
 ; GFX11_W32-NEXT:    v_div_fmas_f64 v[0:1], s[2:3], v[0:1], v[2:3]
 ; GFX11_W32-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11_W32-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11_W32-NEXT:    s_nop 0
-; GFX11_W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11_W32-NEXT:    s_endpgm
 ;
 ; GFX11_W64-LABEL: test_div_fmas_f64:
 ; GFX11_W64:       ; %bb.0:
 ; GFX11_W64-NEXT:    s_clause 0x1
-; GFX11_W64-NEXT:    s_load_b32 s8, s[2:3], 0x20
-; GFX11_W64-NEXT:    s_load_b256 s[0:7], s[2:3], 0x0
+; GFX11_W64-NEXT:    s_load_b32 s8, s[4:5], 0x20
+; GFX11_W64-NEXT:    s_load_b256 s[0:7], s[4:5], 0x0
 ; GFX11_W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11_W64-NEXT:    s_and_b32 s8, 1, s8
 ; GFX11_W64-NEXT:    v_mov_b32_e32 v0, s4
@@ -846,8 +827,6 @@ define amdgpu_kernel void @test_div_fmas_f64(ptr addrspace(1) %out, double %a, d
 ; GFX11_W64-NEXT:    v_div_fmas_f64 v[0:1], s[2:3], v[0:1], v[2:3]
 ; GFX11_W64-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11_W64-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11_W64-NEXT:    s_nop 0
-; GFX11_W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11_W64-NEXT:    s_endpgm
   %result = call double @llvm.amdgcn.div.fmas.f64(double %a, double %b, double %c, i1 %d)
   store double %result, ptr addrspace(1) %out, align 8
@@ -857,35 +836,35 @@ define amdgpu_kernel void @test_div_fmas_f64(ptr addrspace(1) %out, double %a, d
 define amdgpu_kernel void @test_div_fmas_f32_cond_to_vcc(ptr addrspace(1) %out, float %a, float %b, float %c, i32 %i) {
 ; GFX7-LABEL: test_div_fmas_f32_cond_to_vcc:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x2
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b32 s6, -1
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_cmp_eq_u32 s7, 0
-; GFX7-NEXT:    s_cselect_b32 s2, 1, 0
-; GFX7-NEXT:    s_and_b32 s2, 1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v2, s6
-; GFX7-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
-; GFX7-NEXT:    s_mov_b32 s2, -1
-; GFX7-NEXT:    s_nop 2
+; GFX7-NEXT:    s_cmp_eq_u32 s3, 0
+; GFX7-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s0
+; GFX7-NEXT:    s_and_b32 s0, 1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
+; GFX7-NEXT:    s_nop 3
 ; GFX7-NEXT:    v_div_fmas_f32 v0, v0, v1, v2
-; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: test_div_fmas_f32_cond_to_vcc:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x8
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_cmp_eq_u32 s7, 0
-; GFX8-NEXT:    s_cselect_b32 s0, 1, 0
-; GFX8-NEXT:    s_and_b32 s0, 1, s0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s4
-; GFX8-NEXT:    v_mov_b32_e32 v1, s5
-; GFX8-NEXT:    v_mov_b32_e32 v2, s6
+; GFX8-NEXT:    s_cmp_eq_u32 s3, 0
+; GFX8-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    s_and_b32 s0, 1, s3
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX8-NEXT:    s_nop 2
 ; GFX8-NEXT:    v_div_fmas_f32 v2, v0, v1, v2
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
@@ -897,72 +876,68 @@ define amdgpu_kernel void @test_div_fmas_f32_cond_to_vcc(ptr addrspace(1) %out,
 ; GFX10_W32-LABEL: test_div_fmas_f32_cond_to_vcc:
 ; GFX10_W32:       ; %bb.0:
 ; GFX10_W32-NEXT:    s_clause 0x1
-; GFX10_W32-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x8
-; GFX10_W32-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX10_W32-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x8
+; GFX10_W32-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10_W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10_W32-NEXT:    s_cmp_eq_u32 s7, 0
-; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s5
-; GFX10_W32-NEXT:    s_cselect_b32 s2, 1, 0
-; GFX10_W32-NEXT:    v_mov_b32_e32 v1, s6
-; GFX10_W32-NEXT:    s_and_b32 s2, 1, s2
-; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s2
-; GFX10_W32-NEXT:    v_div_fmas_f32 v0, s4, v0, v1
+; GFX10_W32-NEXT:    s_cmp_eq_u32 s3, 0
+; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s1
+; GFX10_W32-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX10_W32-NEXT:    v_mov_b32_e32 v1, s2
+; GFX10_W32-NEXT:    s_and_b32 s3, 1, s3
+; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s3
+; GFX10_W32-NEXT:    v_div_fmas_f32 v0, s0, v0, v1
 ; GFX10_W32-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10_W32-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10_W32-NEXT:    global_store_dword v1, v0, s[6:7]
 ; GFX10_W32-NEXT:    s_endpgm
 ;
 ; GFX10_W64-LABEL: test_div_fmas_f32_cond_to_vcc:
 ; GFX10_W64:       ; %bb.0:
 ; GFX10_W64-NEXT:    s_clause 0x1
-; GFX10_W64-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x8
-; GFX10_W64-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX10_W64-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x8
+; GFX10_W64-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10_W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10_W64-NEXT:    s_cmp_eq_u32 s7, 0
-; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s5
-; GFX10_W64-NEXT:    s_cselect_b32 s2, 1, 0
-; GFX10_W64-NEXT:    v_mov_b32_e32 v1, s6
-; GFX10_W64-NEXT:    s_and_b32 s2, 1, s2
-; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
-; GFX10_W64-NEXT:    v_div_fmas_f32 v0, s4, v0, v1
+; GFX10_W64-NEXT:    s_cmp_eq_u32 s3, 0
+; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s1
+; GFX10_W64-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX10_W64-NEXT:    v_mov_b32_e32 v1, s2
+; GFX10_W64-NEXT:    s_and_b32 s3, 1, s3
+; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s3
+; GFX10_W64-NEXT:    v_div_fmas_f32 v0, s0, v0, v1
 ; GFX10_W64-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10_W64-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10_W64-NEXT:    global_store_dword v1, v0, s[6:7]
 ; GFX10_W64-NEXT:    s_endpgm
 ;
 ; GFX11_W32-LABEL: test_div_fmas_f32_cond_to_vcc:
 ; GFX11_W32:       ; %bb.0:
 ; GFX11_W32-NEXT:    s_clause 0x1
-; GFX11_W32-NEXT:    s_load_b128 s[4:7], s[2:3], 0x8
-; GFX11_W32-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11_W32-NEXT:    s_load_b128 s[0:3], s[4:5], 0x8
+; GFX11_W32-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
 ; GFX11_W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11_W32-NEXT:    s_cmp_eq_u32 s7, 0
-; GFX11_W32-NEXT:    v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
-; GFX11_W32-NEXT:    s_cselect_b32 s2, 1, 0
-; GFX11_W32-NEXT:    s_and_b32 s2, 1, s2
-; GFX11_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s2
-; GFX11_W32-NEXT:    v_div_fmas_f32 v0, s4, v0, v1
+; GFX11_W32-NEXT:    s_cmp_eq_u32 s3, 0
+; GFX11_W32-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s2
+; GFX11_W32-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX11_W32-NEXT:    s_and_b32 s3, 1, s3
+; GFX11_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s3
+; GFX11_W32-NEXT:    v_div_fmas_f32 v0, s0, v0, v1
 ; GFX11_W32-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11_W32-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11_W32-NEXT:    s_nop 0
-; GFX11_W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11_W32-NEXT:    global_store_b32 v1, v0, s[4:5]
 ; GFX11_W32-NEXT:    s_endpgm
 ;
 ; GFX11_W64-LABEL: test_div_fmas_f32_cond_to_vcc:
 ; GFX11_W64:       ; %bb.0:
 ; GFX11_W64-NEXT:    s_clause 0x1
-; GFX11_W64-NEXT:    s_load_b128 s[4:7], s[2:3], 0x8
-; GFX11_W64-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11_W64-NEXT:    s_load_b128 s[0:3], s[4:5], 0x8
+; GFX11_W64-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
 ; GFX11_W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11_W64-NEXT:    s_cmp_eq_u32 s7, 0
-; GFX11_W64-NEXT:    v_mov_b32_e32 v0, s5
-; GFX11_W64-NEXT:    s_cselect_b32 s2, 1, 0
-; GFX11_W64-NEXT:    v_mov_b32_e32 v1, s6
-; GFX11_W64-NEXT:    s_and_b32 s2, 1, s2
-; GFX11_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
-; GFX11_W64-NEXT:    v_div_fmas_f32 v0, s4, v0, v1
+; GFX11_W64-NEXT:    s_cmp_eq_u32 s3, 0
+; GFX11_W64-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11_W64-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX11_W64-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11_W64-NEXT:    s_and_b32 s3, 1, s3
+; GFX11_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s3
+; GFX11_W64-NEXT:    v_div_fmas_f32 v0, s0, v0, v1
 ; GFX11_W64-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11_W64-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11_W64-NEXT:    s_nop 0
-; GFX11_W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11_W64-NEXT:    global_store_b32 v1, v0, s[4:5]
 ; GFX11_W64-NEXT:    s_endpgm
   %cmp = icmp eq i32 %i, 0
   %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %cmp)
@@ -973,14 +948,14 @@ define amdgpu_kernel void @test_div_fmas_f32_cond_to_vcc(ptr addrspace(1) %out,
 define amdgpu_kernel void @test_div_fmas_f32_imm_false_cond_to_vcc(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c) {
 ; GFX7-LABEL: test_div_fmas_f32_imm_false_cond_to_vcc:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_load_dword s4, s[2:3], 0xa
-; GFX7-NEXT:    s_load_dword s5, s[2:3], 0x13
-; GFX7-NEXT:    s_load_dword s6, s[2:3], 0x1c
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX7-NEXT:    s_load_dword s2, s[4:5], 0xa
+; GFX7-NEXT:    s_load_dword s3, s[4:5], 0x13
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x1c
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX7-NEXT:    s_mov_b64 vcc, 0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v0, s2
+; GFX7-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX7-NEXT:    v_div_fmas_f32 v0, v0, v1, v2
 ; GFX7-NEXT:    s_mov_b32 s2, -1
@@ -990,16 +965,16 @@ define amdgpu_kernel void @test_div_fmas_f32_imm_false_cond_to_vcc(ptr addrspace
 ;
 ; GFX8-LABEL: test_div_fmas_f32_imm_false_cond_to_vcc:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dword s0, s[2:3], 0x28
-; GFX8-NEXT:    s_load_dword s1, s[2:3], 0x4c
-; GFX8-NEXT:    s_load_dword s4, s[2:3], 0x70
+; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x28
+; GFX8-NEXT:    s_load_dword s1, s[4:5], 0x4c
+; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x70
 ; GFX8-NEXT:    s_mov_b64 vcc, 0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
-; GFX8-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX8-NEXT:    v_div_fmas_f32 v2, v0, v1, v2
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
@@ -1009,14 +984,14 @@ define amdgpu_kernel void @test_div_fmas_f32_imm_false_cond_to_vcc(ptr addrspace
 ; GFX10_W32-LABEL: test_div_fmas_f32_imm_false_cond_to_vcc:
 ; GFX10_W32:       ; %bb.0:
 ; GFX10_W32-NEXT:    s_clause 0x3
-; GFX10_W32-NEXT:    s_load_dword s4, s[2:3], 0x4c
-; GFX10_W32-NEXT:    s_load_dword s5, s[2:3], 0x70
-; GFX10_W32-NEXT:    s_load_dword s6, s[2:3], 0x28
-; GFX10_W32-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX10_W32-NEXT:    s_load_dword s2, s[4:5], 0x4c
+; GFX10_W32-NEXT:    s_load_dword s3, s[4:5], 0x70
+; GFX10_W32-NEXT:    s_load_dword s6, s[4:5], 0x28
+; GFX10_W32-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10_W32-NEXT:    s_mov_b32 vcc_lo, 0
 ; GFX10_W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10_W32-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10_W32-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX10_W32-NEXT:    v_div_fmas_f32 v0, s6, v0, v1
 ; GFX10_W32-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10_W32-NEXT:    global_store_dword v1, v0, s[0:1]
@@ -1025,14 +1000,14 @@ define amdgpu_kernel void @test_div_fmas_f32_imm_false_cond_to_vcc(ptr addrspace
 ; GFX10_W64-LABEL: test_div_fmas_f32_imm_false_cond_to_vcc:
 ; GFX10_W64:       ; %bb.0:
 ; GFX10_W64-NEXT:    s_clause 0x3
-; GFX10_W64-NEXT:    s_load_dword s4, s[2:3], 0x4c
-; GFX10_W64-NEXT:    s_load_dword s5, s[2:3], 0x70
-; GFX10_W64-NEXT:    s_load_dword s6, s[2:3], 0x28
-; GFX10_W64-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX10_W64-NEXT:    s_load_dword s2, s[4:5], 0x4c
+; GFX10_W64-NEXT:    s_load_dword s3, s[4:5], 0x70
+; GFX10_W64-NEXT:    s_load_dword s6, s[4:5], 0x28
+; GFX10_W64-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10_W64-NEXT:    s_mov_b64 vcc, 0
 ; GFX10_W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10_W64-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10_W64-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX10_W64-NEXT:    v_div_fmas_f32 v0, s6, v0, v1
 ; GFX10_W64-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10_W64-NEXT:    global_store_dword v1, v0, s[0:1]
@@ -1041,36 +1016,32 @@ define amdgpu_kernel void @test_div_fmas_f32_imm_false_cond_to_vcc(ptr addrspace
 ; GFX11_W32-LABEL: test_div_fmas_f32_imm_false_cond_to_vcc:
 ; GFX11_W32:       ; %bb.0:
 ; GFX11_W32-NEXT:    s_clause 0x3
-; GFX11_W32-NEXT:    s_load_b32 s4, s[2:3], 0x4c
-; GFX11_W32-NEXT:    s_load_b32 s5, s[2:3], 0x70
-; GFX11_W32-NEXT:    s_load_b32 s6, s[2:3], 0x28
-; GFX11_W32-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11_W32-NEXT:    s_load_b32 s2, s[4:5], 0x4c
+; GFX11_W32-NEXT:    s_load_b32 s3, s[4:5], 0x70
+; GFX11_W32-NEXT:    s_load_b32 s6, s[4:5], 0x28
+; GFX11_W32-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX11_W32-NEXT:    s_mov_b32 vcc_lo, 0
 ; GFX11_W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11_W32-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX11_W32-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX11_W32-NEXT:    v_div_fmas_f32 v0, s6, v0, v1
 ; GFX11_W32-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11_W32-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11_W32-NEXT:    s_nop 0
-; GFX11_W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11_W32-NEXT:    s_endpgm
 ;
 ; GFX11_W64-LABEL: test_div_fmas_f32_imm_false_cond_to_vcc:
 ; GFX11_W64:       ; %bb.0:
 ; GFX11_W64-NEXT:    s_clause 0x3
-; GFX11_W64-NEXT:    s_load_b32 s4, s[2:3], 0x4c
-; GFX11_W64-NEXT:    s_load_b32 s5, s[2:3], 0x70
-; GFX11_W64-NEXT:    s_load_b32 s6, s[2:3], 0x28
-; GFX11_W64-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11_W64-NEXT:    s_load_b32 s2, s[4:5], 0x4c
+; GFX11_W64-NEXT:    s_load_b32 s3, s[4:5], 0x70
+; GFX11_W64-NEXT:    s_load_b32 s6, s[4:5], 0x28
+; GFX11_W64-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX11_W64-NEXT:    s_mov_b64 vcc, 0
 ; GFX11_W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11_W64-NEXT:    v_mov_b32_e32 v0, s4
-; GFX11_W64-NEXT:    v_mov_b32_e32 v1, s5
+; GFX11_W64-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11_W64-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11_W64-NEXT:    v_div_fmas_f32 v0, s6, v0, v1
 ; GFX11_W64-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11_W64-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11_W64-NEXT:    s_nop 0
-; GFX11_W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11_W64-NEXT:    s_endpgm
   %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 false)
   store float %result, ptr addrspace(1) %out, align 4
@@ -1080,14 +1051,14 @@ define amdgpu_kernel void @test_div_fmas_f32_imm_false_cond_to_vcc(ptr addrspace
 define amdgpu_kernel void @test_div_fmas_f32_imm_true_cond_to_vcc(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c) {
 ; GFX7-LABEL: test_div_fmas_f32_imm_true_cond_to_vcc:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_load_dword s4, s[2:3], 0xa
-; GFX7-NEXT:    s_load_dword s5, s[2:3], 0x13
-; GFX7-NEXT:    s_load_dword s6, s[2:3], 0x1c
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX7-NEXT:    s_load_dword s2, s[4:5], 0xa
+; GFX7-NEXT:    s_load_dword s3, s[4:5], 0x13
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x1c
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX7-NEXT:    s_mov_b64 vcc, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v0, s2
+; GFX7-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX7-NEXT:    v_div_fmas_f32 v0, v0, v1, v2
 ; GFX7-NEXT:    s_mov_b32 s2, -1
@@ -1097,16 +1068,16 @@ define amdgpu_kernel void @test_div_fmas_f32_imm_true_cond_to_vcc(ptr addrspace(
 ;
 ; GFX8-LABEL: test_div_fmas_f32_imm_true_cond_to_vcc:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dword s0, s[2:3], 0x28
-; GFX8-NEXT:    s_load_dword s1, s[2:3], 0x4c
-; GFX8-NEXT:    s_load_dword s4, s[2:3], 0x70
+; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x28
+; GFX8-NEXT:    s_load_dword s1, s[4:5], 0x4c
+; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x70
 ; GFX8-NEXT:    s_mov_b64 vcc, -1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
-; GFX8-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX8-NEXT:    v_div_fmas_f32 v2, v0, v1, v2
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
@@ -1116,14 +1087,14 @@ define amdgpu_kernel void @test_div_fmas_f32_imm_true_cond_to_vcc(ptr addrspace(
 ; GFX10_W32-LABEL: test_div_fmas_f32_imm_true_cond_to_vcc:
 ; GFX10_W32:       ; %bb.0:
 ; GFX10_W32-NEXT:    s_clause 0x3
-; GFX10_W32-NEXT:    s_load_dword s4, s[2:3], 0x4c
-; GFX10_W32-NEXT:    s_load_dword s5, s[2:3], 0x70
-; GFX10_W32-NEXT:    s_load_dword s6, s[2:3], 0x28
-; GFX10_W32-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX10_W32-NEXT:    s_load_dword s2, s[4:5], 0x4c
+; GFX10_W32-NEXT:    s_load_dword s3, s[4:5], 0x70
+; GFX10_W32-NEXT:    s_load_dword s6, s[4:5], 0x28
+; GFX10_W32-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10_W32-NEXT:    s_mov_b32 vcc_lo, -1
 ; GFX10_W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10_W32-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10_W32-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX10_W32-NEXT:    v_div_fmas_f32 v0, s6, v0, v1
 ; GFX10_W32-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10_W32-NEXT:    global_store_dword v1, v0, s[0:1]
@@ -1132,14 +1103,14 @@ define amdgpu_kernel void @test_div_fmas_f32_imm_true_cond_to_vcc(ptr addrspace(
 ; GFX10_W64-LABEL: test_div_fmas_f32_imm_true_cond_to_vcc:
 ; GFX10_W64:       ; %bb.0:
 ; GFX10_W64-NEXT:    s_clause 0x3
-; GFX10_W64-NEXT:    s_load_dword s4, s[2:3], 0x4c
-; GFX10_W64-NEXT:    s_load_dword s5, s[2:3], 0x70
-; GFX10_W64-NEXT:    s_load_dword s6, s[2:3], 0x28
-; GFX10_W64-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX10_W64-NEXT:    s_load_dword s2, s[4:5], 0x4c
+; GFX10_W64-NEXT:    s_load_dword s3, s[4:5], 0x70
+; GFX10_W64-NEXT:    s_load_dword s6, s[4:5], 0x28
+; GFX10_W64-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10_W64-NEXT:    s_mov_b64 vcc, -1
 ; GFX10_W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10_W64-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10_W64-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX10_W64-NEXT:    v_div_fmas_f32 v0, s6, v0, v1
 ; GFX10_W64-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10_W64-NEXT:    global_store_dword v1, v0, s[0:1]
@@ -1148,36 +1119,32 @@ define amdgpu_kernel void @test_div_fmas_f32_imm_true_cond_to_vcc(ptr addrspace(
 ; GFX11_W32-LABEL: test_div_fmas_f32_imm_true_cond_to_vcc:
 ; GFX11_W32:       ; %bb.0:
 ; GFX11_W32-NEXT:    s_clause 0x3
-; GFX11_W32-NEXT:    s_load_b32 s4, s[2:3], 0x4c
-; GFX11_W32-NEXT:    s_load_b32 s5, s[2:3], 0x70
-; GFX11_W32-NEXT:    s_load_b32 s6, s[2:3], 0x28
-; GFX11_W32-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11_W32-NEXT:    s_load_b32 s2, s[4:5], 0x4c
+; GFX11_W32-NEXT:    s_load_b32 s3, s[4:5], 0x70
+; GFX11_W32-NEXT:    s_load_b32 s6, s[4:5], 0x28
+; GFX11_W32-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX11_W32-NEXT:    s_mov_b32 vcc_lo, -1
 ; GFX11_W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11_W32-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX11_W32-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX11_W32-NEXT:    v_div_fmas_f32 v0, s6, v0, v1
 ; GFX11_W32-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11_W32-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11_W32-NEXT:    s_nop 0
-; GFX11_W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11_W32-NEXT:    s_endpgm
 ;
 ; GFX11_W64-LABEL: test_div_fmas_f32_imm_true_cond_to_vcc:
 ; GFX11_W64:       ; %bb.0:
 ; GFX11_W64-NEXT:    s_clause 0x3
-; GFX11_W64-NEXT:    s_load_b32 s4, s[2:3], 0x4c
-; GFX11_W64-NEXT:    s_load_b32 s5, s[2:3], 0x70
-; GFX11_W64-NEXT:    s_load_b32 s6, s[2:3], 0x28
-; GFX11_W64-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11_W64-NEXT:    s_load_b32 s2, s[4:5], 0x4c
+; GFX11_W64-NEXT:    s_load_b32 s3, s[4:5], 0x70
+; GFX11_W64-NEXT:    s_load_b32 s6, s[4:5], 0x28
+; GFX11_W64-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX11_W64-NEXT:    s_mov_b64 vcc, -1
 ; GFX11_W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11_W64-NEXT:    v_mov_b32_e32 v0, s4
-; GFX11_W64-NEXT:    v_mov_b32_e32 v1, s5
+; GFX11_W64-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11_W64-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX11_W64-NEXT:    v_div_fmas_f32 v0, s6, v0, v1
 ; GFX11_W64-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11_W64-NEXT:    global_store_b32 v1, v0, s[0:1]
-; GFX11_W64-NEXT:    s_nop 0
-; GFX11_W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11_W64-NEXT:    s_endpgm
   %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 true)
   store float %result, ptr addrspace(1) %out, align 4
@@ -1187,40 +1154,40 @@ define amdgpu_kernel void @test_div_fmas_f32_imm_true_cond_to_vcc(ptr addrspace(
 define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(ptr addrspace(1) %out, ptr addrspace(1) %in, [8 x i32], i32 %d) {
 ; GFX7-LABEL: test_div_fmas_f32_logical_cond_to_vcc:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[2:3], 0x0
-; GFX7-NEXT:    s_load_dword s0, s[2:3], 0xc
-; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0xc
+; GFX7-NEXT:    s_mov_b32 s2, 0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v2, 0
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b64 s[4:5], s[10:11]
-; GFX7-NEXT:    buffer_load_dword v3, v[1:2], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    s_mov_b64 s[0:1], s[10:11]
+; GFX7-NEXT:    buffer_load_dword v3, v[1:2], s[0:3], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    buffer_load_dword v4, v[1:2], s[4:7], 0 addr64 offset:4 glc
+; GFX7-NEXT:    buffer_load_dword v4, v[1:2], s[0:3], 0 addr64 offset:4 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    buffer_load_dword v1, v[1:2], s[4:7], 0 addr64 offset:8 glc
+; GFX7-NEXT:    buffer_load_dword v1, v[1:2], s[0:3], 0 addr64 offset:8 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX7-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX7-NEXT:    s_cselect_b32 s0, 1, 0
 ; GFX7-NEXT:    s_and_b32 s0, 1, s0
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX7-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
-; GFX7-NEXT:    s_mov_b32 s6, -1
+; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_and_b64 vcc, vcc, s[0:1]
-; GFX7-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX7-NEXT:    s_mov_b64 s[10:11], s[2:3]
 ; GFX7-NEXT:    v_div_fmas_f32 v0, v3, v4, v1
 ; GFX7-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:8
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: test_div_fmas_f32_logical_cond_to_vcc:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
-; GFX8-NEXT:    s_load_dword s2, s[2:3], 0x30
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX8-NEXT:    s_load_dword s4, s[4:5], 0x30
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 2, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v1, s6
-; GFX8-NEXT:    v_mov_b32_e32 v2, s7
+; GFX8-NEXT:    v_mov_b32_e32 v1, s2
+; GFX8-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v3
 ; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 4, v1
@@ -1233,9 +1200,9 @@ define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(ptr addrspace(1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    flat_load_dword v3, v[5:6] glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    s_add_u32 s0, s4, 8
-; GFX8-NEXT:    s_addc_u32 s1, s5, 0
-; GFX8-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX8-NEXT:    s_add_u32 s0, s0, 8
+; GFX8-NEXT:    s_addc_u32 s1, s1, 0
+; GFX8-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX8-NEXT:    s_and_b32 s2, 1, s2
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
@@ -1250,104 +1217,96 @@ define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(ptr addrspace(1
 ;
 ; GFX10_W32-LABEL: test_div_fmas_f32_logical_cond_to_vcc:
 ; GFX10_W32:       ; %bb.0:
-; GFX10_W32-NEXT:    s_clause 0x1
-; GFX10_W32-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
-; GFX10_W32-NEXT:    s_load_dword s0, s[2:3], 0x30
+; GFX10_W32-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX10_W32-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX10_W32-NEXT:    s_load_dword s4, s[4:5], 0x30
 ; GFX10_W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10_W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10_W32-NEXT:    global_load_dword v2, v1, s[6:7] glc dlc
+; GFX10_W32-NEXT:    global_load_dword v2, v1, s[2:3] glc dlc
 ; GFX10_W32-NEXT:    s_waitcnt vmcnt(0)
-; GFX10_W32-NEXT:    global_load_dword v3, v1, s[6:7] offset:4 glc dlc
+; GFX10_W32-NEXT:    global_load_dword v3, v1, s[2:3] offset:4 glc dlc
 ; GFX10_W32-NEXT:    s_waitcnt vmcnt(0)
-; GFX10_W32-NEXT:    global_load_dword v4, v1, s[6:7] offset:8 glc dlc
+; GFX10_W32-NEXT:    global_load_dword v4, v1, s[2:3] offset:8 glc dlc
 ; GFX10_W32-NEXT:    s_waitcnt vmcnt(0)
-; GFX10_W32-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX10_W32-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10_W32-NEXT:    s_cselect_b32 s0, 1, 0
-; GFX10_W32-NEXT:    s_and_b32 s0, 1, s0
-; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
-; GFX10_W32-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX10_W32-NEXT:    s_cmp_lg_u32 s4, 0
+; GFX10_W32-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX10_W32-NEXT:    s_and_b32 s2, 1, s2
+; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 s2, 0, s2
+; GFX10_W32-NEXT:    s_and_b32 vcc_lo, vcc_lo, s2
 ; GFX10_W32-NEXT:    v_div_fmas_f32 v0, v2, v3, v4
-; GFX10_W32-NEXT:    global_store_dword v1, v0, s[4:5] offset:8
+; GFX10_W32-NEXT:    global_store_dword v1, v0, s[0:1] offset:8
 ; GFX10_W32-NEXT:    s_endpgm
 ;
 ; GFX10_W64-LABEL: test_div_fmas_f32_logical_cond_to_vcc:
 ; GFX10_W64:       ; %bb.0:
-; GFX10_W64-NEXT:    s_clause 0x1
-; GFX10_W64-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
-; GFX10_W64-NEXT:    s_load_dword s0, s[2:3], 0x30
+; GFX10_W64-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX10_W64-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX10_W64-NEXT:    s_load_dword s4, s[4:5], 0x30
 ; GFX10_W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX10_W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10_W64-NEXT:    global_load_dword v2, v1, s[6:7] glc dlc
+; GFX10_W64-NEXT:    global_load_dword v2, v1, s[2:3] glc dlc
 ; GFX10_W64-NEXT:    s_waitcnt vmcnt(0)
-; GFX10_W64-NEXT:    global_load_dword v3, v1, s[6:7] offset:4 glc dlc
+; GFX10_W64-NEXT:    global_load_dword v3, v1, s[2:3] offset:4 glc dlc
 ; GFX10_W64-NEXT:    s_waitcnt vmcnt(0)
-; GFX10_W64-NEXT:    global_load_dword v4, v1, s[6:7] offset:8 glc dlc
+; GFX10_W64-NEXT:    global_load_dword v4, v1, s[2:3] offset:8 glc dlc
 ; GFX10_W64-NEXT:    s_waitcnt vmcnt(0)
-; GFX10_W64-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX10_W64-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10_W64-NEXT:    s_cselect_b32 s0, 1, 0
-; GFX10_W64-NEXT:    s_and_b32 s0, 1, s0
-; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
-; GFX10_W64-NEXT:    s_and_b64 vcc, vcc, s[0:1]
+; GFX10_W64-NEXT:    s_cmp_lg_u32 s4, 0
+; GFX10_W64-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX10_W64-NEXT:    s_and_b32 s2, 1, s2
+; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, s2
+; GFX10_W64-NEXT:    s_and_b64 vcc, vcc, s[2:3]
 ; GFX10_W64-NEXT:    v_div_fmas_f32 v0, v2, v3, v4
-; GFX10_W64-NEXT:    global_store_dword v1, v0, s[4:5] offset:8
+; GFX10_W64-NEXT:    global_store_dword v1, v0, s[0:1] offset:8
 ; GFX10_W64-NEXT:    s_endpgm
 ;
 ; GFX11_W32-LABEL: test_div_fmas_f32_logical_cond_to_vcc:
 ; GFX11_W32:       ; %bb.0:
-; GFX11_W32-NEXT:    s_clause 0x1
-; GFX11_W32-NEXT:    s_load_b128 s[4:7], s[2:3], 0x0
-; GFX11_W32-NEXT:    s_load_b32 s0, s[2:3], 0x30
+; GFX11_W32-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX11_W32-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11_W32-NEXT:    s_load_b32 s4, s[4:5], 0x30
 ; GFX11_W32-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; GFX11_W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11_W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11_W32-NEXT:    global_load_b32 v2, v1, s[6:7] glc dlc
+; GFX11_W32-NEXT:    global_load_b32 v2, v1, s[2:3] glc dlc
 ; GFX11_W32-NEXT:    s_waitcnt vmcnt(0)
-; GFX11_W32-NEXT:    global_load_b32 v3, v1, s[6:7] offset:4 glc dlc
+; GFX11_W32-NEXT:    global_load_b32 v3, v1, s[2:3] offset:4 glc dlc
 ; GFX11_W32-NEXT:    s_waitcnt vmcnt(0)
-; GFX11_W32-NEXT:    global_load_b32 v1, v1, s[6:7] offset:8 glc dlc
+; GFX11_W32-NEXT:    global_load_b32 v1, v1, s[2:3] offset:8 glc dlc
 ; GFX11_W32-NEXT:    s_waitcnt vmcnt(0)
-; GFX11_W32-NEXT:    s_cmp_lg_u32 s0, 0
-; GFX11_W32-NEXT:    s_cselect_b32 s0, 1, 0
-; GFX11_W32-NEXT:    s_and_b32 s0, 1, s0
-; GFX11_W32-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
-; GFX11_W32-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; GFX11_W32-NEXT:    s_cmp_lg_u32 s4, 0
+; GFX11_W32-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX11_W32-NEXT:    s_and_b32 s2, 1, s2
+; GFX11_W32-NEXT:    v_cmp_ne_u32_e64 s2, 0, s2
+; GFX11_W32-NEXT:    s_and_b32 vcc_lo, vcc_lo, s2
 ; GFX11_W32-NEXT:    v_div_fmas_f32 v0, v2, v3, v1
 ; GFX11_W32-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11_W32-NEXT:    global_store_b32 v1, v0, s[4:5] offset:8
-; GFX11_W32-NEXT:    s_nop 0
-; GFX11_W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11_W32-NEXT:    global_store_b32 v1, v0, s[0:1] offset:8
 ; GFX11_W32-NEXT:    s_endpgm
 ;
 ; GFX11_W64-LABEL: test_div_fmas_f32_logical_cond_to_vcc:
 ; GFX11_W64:       ; %bb.0:
-; GFX11_W64-NEXT:    s_clause 0x1
-; GFX11_W64-NEXT:    s_load_b128 s[4:7], s[2:3], 0x0
-; GFX11_W64-NEXT:    s_load_b32 s0, s[2:3], 0x30
+; GFX11_W64-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX11_W64-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11_W64-NEXT:    s_load_b32 s4, s[4:5], 0x30
 ; GFX11_W64-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; GFX11_W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX11_W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11_W64-NEXT:    global_load_b32 v2, v1, s[6:7] glc dlc
+; GFX11_W64-NEXT:    global_load_b32 v2, v1, s[2:3] glc dlc
 ; GFX11_W64-NEXT:    s_waitcnt vmcnt(0)
-; GFX11_W64-NEXT:    global_load_b32 v3, v1, s[6:7] offset:4 glc dlc
+; GFX11_W64-NEXT:    global_load_b32 v3, v1, s[2:3] offset:4 glc dlc
 ; GFX11_W64-NEXT:    s_waitcnt vmcnt(0)
-; GFX11_W64-NEXT:    global_load_b32 v1, v1, s[6:7] offset:8 glc dlc
+; GFX11_W64-NEXT:    global_load_b32 v1, v1, s[2:3] offset:8 glc dlc
 ; GFX11_W64-NEXT:    s_waitcnt vmcnt(0)
-; GFX11_W64-NEXT:    s_cmp_lg_u32 s0, 0
-; GFX11_W64-NEXT:    s_cselect_b32 s0, 1, 0
-; GFX11_W64-NEXT:    s_and_b32 s0, 1, s0
-; GFX11_W64-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
-; GFX11_W64-NEXT:    s_and_b64 vcc, vcc, s[0:1]
+; GFX11_W64-NEXT:    s_cmp_lg_u32 s4, 0
+; GFX11_W64-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX11_W64-NEXT:    s_and_b32 s2, 1, s2
+; GFX11_W64-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, s2
+; GFX11_W64-NEXT:    s_and_b64 vcc, vcc, s[2:3]
 ; GFX11_W64-NEXT:    v_div_fmas_f32 v0, v2, v3, v1
 ; GFX11_W64-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11_W64-NEXT:    global_store_b32 v1, v0, s[4:5] offset:8
-; GFX11_W64-NEXT:    s_nop 0
-; GFX11_W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11_W64-NEXT:    global_store_b32 v1, v0, s[0:1] offset:8
 ; GFX11_W64-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.a = getelementptr float, ptr addrspace(1) %in, i32 %tid
@@ -1371,19 +1330,19 @@ define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(ptr addrspace(1
 define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [8 x i32], ptr addrspace(1) %in, [8 x i32], ptr addrspace(1) %dummy) {
 ; GFX7-LABEL: test_div_fmas_f32_i1_phi_vcc:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0xa
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xa
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v2, 0
-; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s2, 0
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    buffer_load_dwordx3 v[1:3], v[1:2], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_dwordx3 v[1:3], v[1:2], s[0:3], 0 addr64
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v0
 ; GFX7-NEXT:    s_mov_b64 vcc, 0
-; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], s[0:1]
+; GFX7-NEXT:    s_and_saveexec_b64 s[6:7], s[0:1]
 ; GFX7-NEXT:    s_cbranch_execz .LBB13_2
 ; GFX7-NEXT:  ; %bb.1: ; %bb
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x14
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x14
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1395,18 +1354,18 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [
 ; GFX7-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
 ; GFX7-NEXT:    s_or_b64 vcc, s[8:9], s[0:1]
 ; GFX7-NEXT:  .LBB13_2: ; %exit
-; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GFX7-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_div_fmas_f32 v0, v1, v2, v3
-; GFX7-NEXT:    s_mov_b32 s6, -1
+; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0 offset:8
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:8
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: test_div_fmas_f32_i1_phi_vcc:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x28
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x28
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 2, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s1
@@ -1416,10 +1375,10 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [
 ; GFX8-NEXT:    flat_load_dwordx3 v[1:3], v[1:2]
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v0
 ; GFX8-NEXT:    s_mov_b64 vcc, 0
-; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], s[0:1]
+; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], s[0:1]
 ; GFX8-NEXT:    s_cbranch_execz .LBB13_2
 ; GFX8-NEXT:  ; %bb.1: ; %bb
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x50
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x50
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1431,10 +1390,10 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [
 ; GFX8-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
 ; GFX8-NEXT:    s_or_b64 vcc, s[6:7], s[0:1]
 ; GFX8-NEXT:  .LBB13_2: ; %exit
-; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_div_fmas_f32 v2, v1, v2, v3
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_add_u32 s0, s0, 8
 ; GFX8-NEXT:    s_addc_u32 s1, s1, 0
@@ -1445,7 +1404,7 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [
 ;
 ; GFX10_W32-LABEL: test_div_fmas_f32_i1_phi_vcc:
 ; GFX10_W32:       ; %bb.0: ; %entry
-; GFX10_W32-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x28
+; GFX10_W32-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x28
 ; GFX10_W32-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; GFX10_W32-NEXT:    s_mov_b32 vcc_lo, 0
 ; GFX10_W32-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1454,20 +1413,20 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [
 ; GFX10_W32-NEXT:    s_and_saveexec_b32 s1, s0
 ; GFX10_W32-NEXT:    s_cbranch_execz .LBB13_2
 ; GFX10_W32-NEXT:  ; %bb.1: ; %bb
-; GFX10_W32-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x50
+; GFX10_W32-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x50
 ; GFX10_W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10_W32-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX10_W32-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; GFX10_W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10_W32-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX10_W32-NEXT:    s_cselect_b32 s0, 1, 0
-; GFX10_W32-NEXT:    s_andn2_b32 s4, 0, exec_lo
+; GFX10_W32-NEXT:    s_andn2_b32 s2, 0, exec_lo
 ; GFX10_W32-NEXT:    s_and_b32 s0, 1, s0
 ; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
 ; GFX10_W32-NEXT:    s_and_b32 s0, exec_lo, s0
-; GFX10_W32-NEXT:    s_or_b32 vcc_lo, s4, s0
+; GFX10_W32-NEXT:    s_or_b32 vcc_lo, s2, s0
 ; GFX10_W32-NEXT:  .LBB13_2: ; %exit
 ; GFX10_W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
-; GFX10_W32-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX10_W32-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10_W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10_W32-NEXT:    v_div_fmas_f32 v0, v1, v2, v3
 ; GFX10_W32-NEXT:    v_mov_b32_e32 v1, 0
@@ -1477,16 +1436,16 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [
 ;
 ; GFX10_W64-LABEL: test_div_fmas_f32_i1_phi_vcc:
 ; GFX10_W64:       ; %bb.0: ; %entry
-; GFX10_W64-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x28
+; GFX10_W64-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x28
 ; GFX10_W64-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; GFX10_W64-NEXT:    s_mov_b64 vcc, 0
 ; GFX10_W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10_W64-NEXT:    global_load_dwordx3 v[1:3], v1, s[0:1]
 ; GFX10_W64-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v0
-; GFX10_W64-NEXT:    s_and_saveexec_b64 s[4:5], s[0:1]
+; GFX10_W64-NEXT:    s_and_saveexec_b64 s[2:3], s[0:1]
 ; GFX10_W64-NEXT:    s_cbranch_execz .LBB13_2
 ; GFX10_W64-NEXT:  ; %bb.1: ; %bb
-; GFX10_W64-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x50
+; GFX10_W64-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x50
 ; GFX10_W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10_W64-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX10_W64-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1498,8 +1457,8 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [
 ; GFX10_W64-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
 ; GFX10_W64-NEXT:    s_or_b64 vcc, s[6:7], s[0:1]
 ; GFX10_W64-NEXT:  .LBB13_2: ; %exit
-; GFX10_W64-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX10_W64-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX10_W64-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX10_W64-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10_W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10_W64-NEXT:    v_div_fmas_f32 v0, v1, v2, v3
 ; GFX10_W64-NEXT:    v_mov_b32_e32 v1, 0
@@ -1509,7 +1468,7 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [
 ;
 ; GFX11_W32-LABEL: test_div_fmas_f32_i1_phi_vcc:
 ; GFX11_W32:       ; %bb.0: ; %entry
-; GFX11_W32-NEXT:    s_load_b64 s[0:1], s[2:3], 0x28
+; GFX11_W32-NEXT:    s_load_b64 s[0:1], s[4:5], 0x28
 ; GFX11_W32-NEXT:    v_and_b32_e32 v3, 0x3ff, v0
 ; GFX11_W32-NEXT:    s_mov_b32 vcc_lo, 0
 ; GFX11_W32-NEXT:    v_lshlrev_b32_e32 v0, 2, v3
@@ -1519,42 +1478,40 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [
 ; GFX11_W32-NEXT:    v_cmpx_eq_u32_e32 0, v3
 ; GFX11_W32-NEXT:    s_cbranch_execz .LBB13_2
 ; GFX11_W32-NEXT:  ; %bb.1: ; %bb
-; GFX11_W32-NEXT:    s_load_b64 s[4:5], s[2:3], 0x50
+; GFX11_W32-NEXT:    s_load_b64 s[2:3], s[4:5], 0x50
 ; GFX11_W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11_W32-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11_W32-NEXT:    s_load_b32 s0, s[2:3], 0x0
 ; GFX11_W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11_W32-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX11_W32-NEXT:    s_cselect_b32 s0, 1, 0
-; GFX11_W32-NEXT:    s_and_not1_b32 s4, 0, exec_lo
+; GFX11_W32-NEXT:    s_and_not1_b32 s2, 0, exec_lo
 ; GFX11_W32-NEXT:    s_and_b32 s0, 1, s0
 ; GFX11_W32-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
 ; GFX11_W32-NEXT:    s_and_b32 s0, exec_lo, s0
-; GFX11_W32-NEXT:    s_or_b32 vcc_lo, s4, s0
+; GFX11_W32-NEXT:    s_or_b32 vcc_lo, s2, s0
 ; GFX11_W32-NEXT:  .LBB13_2: ; %exit
 ; GFX11_W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
-; GFX11_W32-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11_W32-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX11_W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11_W32-NEXT:    v_div_fmas_f32 v0, v0, v1, v2
 ; GFX11_W32-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11_W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11_W32-NEXT:    global_store_b32 v1, v0, s[0:1] offset:8
-; GFX11_W32-NEXT:    s_nop 0
-; GFX11_W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11_W32-NEXT:    s_endpgm
 ;
 ; GFX11_W64-LABEL: test_div_fmas_f32_i1_phi_vcc:
 ; GFX11_W64:       ; %bb.0: ; %entry
-; GFX11_W64-NEXT:    s_load_b64 s[0:1], s[2:3], 0x28
+; GFX11_W64-NEXT:    s_load_b64 s[0:1], s[4:5], 0x28
 ; GFX11_W64-NEXT:    v_and_b32_e32 v3, 0x3ff, v0
 ; GFX11_W64-NEXT:    s_mov_b64 vcc, 0
-; GFX11_W64-NEXT:    s_mov_b64 s[4:5], exec
+; GFX11_W64-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX11_W64-NEXT:    v_lshlrev_b32_e32 v0, 2, v3
 ; GFX11_W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11_W64-NEXT:    global_load_b96 v[0:2], v0, s[0:1]
 ; GFX11_W64-NEXT:    v_cmpx_eq_u32_e32 0, v3
 ; GFX11_W64-NEXT:    s_cbranch_execz .LBB13_2
 ; GFX11_W64-NEXT:  ; %bb.1: ; %bb
-; GFX11_W64-NEXT:    s_load_b64 s[0:1], s[2:3], 0x50
+; GFX11_W64-NEXT:    s_load_b64 s[0:1], s[4:5], 0x50
 ; GFX11_W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11_W64-NEXT:    s_load_b32 s0, s[0:1], 0x0
 ; GFX11_W64-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1566,15 +1523,13 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [
 ; GFX11_W64-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
 ; GFX11_W64-NEXT:    s_or_b64 vcc, s[6:7], s[0:1]
 ; GFX11_W64-NEXT:  .LBB13_2: ; %exit
-; GFX11_W64-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX11_W64-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11_W64-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX11_W64-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX11_W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11_W64-NEXT:    v_div_fmas_f32 v0, v0, v1, v2
 ; GFX11_W64-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11_W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11_W64-NEXT:    global_store_b32 v1, v0, s[0:1] offset:8
-; GFX11_W64-NEXT:    s_nop 0
-; GFX11_W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11_W64-NEXT:    s_endpgm
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()



More information about the llvm-commits mailing list