[llvm] [AMDGPU] Update codegen tests for PR #154069 (PR #154862)
Anshil Gandhi via llvm-commits
llvm-commits at lists.llvm.org
Fri Aug 22 00:19:19 PDT 2025
https://github.com/gandhi56 updated https://github.com/llvm/llvm-project/pull/154862
From 5d0483e44cce7c38e6fae4f3338ae4045d61dae5 Mon Sep 17 00:00:00 2001
From: Anshil Gandhi <Anshil.Gandhi at amd.com>
Date: Thu, 21 Aug 2025 16:29:32 -0500
Subject: [PATCH] [AMDGPU] Update codegen tests for PR #154069
---
.../AMDGPU/GlobalISel/amdgpu-irtranslator.ll | 15 +-
llvm/test/CodeGen/AMDGPU/add_i64.ll | 137 +++--
.../CodeGen/AMDGPU/atomic_cmp_swap_local.ll | 489 +++++++++++++++---
.../CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll | 275 +++++++---
4 files changed, 723 insertions(+), 193 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/amdgpu-irtranslator.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/amdgpu-irtranslator.ll
index 523d51ddcd2bc..fc236147f1238 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/amdgpu-irtranslator.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/amdgpu-irtranslator.ll
@@ -1,12 +1,21 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn -mcpu=fiji -O0 -stop-after=irtranslator -global-isel %s -o - 2>&1 | FileCheck %s
; This file checks that the translation from llvm IR to generic
; MachineInstr is correct.
; Tests for add.
-; CHECK: name: addi32
-; CHECK: {{%[0-9]+}}:_(s32) = G_ADD
-define amdgpu_kernel void @addi32(i32 %arg1, i32 %arg2) {
+define void @addi32(i32 %arg1, i32 %arg2) {
+ ; CHECK-LABEL: name: addi32
+ ; CHECK: bb.1 (%ir-block.0):
+ ; CHECK-NEXT: liveins: $vgpr0, $vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[COPY1]]
+ ; CHECK-NEXT: G_STORE [[ADD]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) poison`, addrspace 1)
+ ; CHECK-NEXT: SI_RETURN
%res = add i32 %arg1, %arg2
store i32 %res, ptr addrspace(1) poison
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/add_i64.ll b/llvm/test/CodeGen/AMDGPU/add_i64.ll
index eedd56de8c7c2..a8560e8498c4d 100644
--- a/llvm/test/CodeGen/AMDGPU/add_i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/add_i64.ll
@@ -1,12 +1,29 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=SI %s
-
-declare i32 @llvm.amdgcn.workitem.id.x() readnone
-
-; SI-LABEL: {{^}}test_i64_vreg:
-; SI: v_add_i32
-; SI: v_addc_u32
define amdgpu_kernel void @test_i64_vreg(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %inA, ptr addrspace(1) noalias %inB) {
+; SI-LABEL: test_i64_vreg:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s14, 0
+; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_mov_b32 s15, s11
+; SI-NEXT: s_mov_b64 s[6:7], s[14:15]
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b64 s[12:13], s[2:3]
+; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[12:15], 0 addr64
+; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_mov_b32 s8, s0
+; SI-NEXT: s_mov_b32 s9, s1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_add_i32_e32 v0, vcc, v2, v0
+; SI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; SI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
%a_ptr = getelementptr i64, ptr addrspace(1) %inA, i32 %tid
%b_ptr = getelementptr i64, ptr addrspace(1) %inB, i32 %tid
@@ -18,10 +35,22 @@ define amdgpu_kernel void @test_i64_vreg(ptr addrspace(1) noalias %out, ptr addr
}
; Check that the SGPR add operand is correctly moved to a VGPR.
-; SI-LABEL: {{^}}sgpr_operand:
-; SI: s_add_u32
-; SI: s_addc_u32
define amdgpu_kernel void @sgpr_operand(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %in_bar, i64 %a) {
+; SI-LABEL: sgpr_operand:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xf
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_add_u32 s4, s6, s4
+; SI-NEXT: s_addc_u32 s5, s7, s5
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: v_mov_b32_e32 v1, s5
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT: s_endpgm
%foo = load i64, ptr addrspace(1) %in, align 8
%result = add i64 %foo, %a
store i64 %result, ptr addrspace(1) %out
@@ -30,35 +59,76 @@ define amdgpu_kernel void @sgpr_operand(ptr addrspace(1) noalias %out, ptr addrs
; Swap the arguments. Check that the SGPR -> VGPR copy works with the
; SGPR as other operand.
-;
-; SI-LABEL: {{^}}sgpr_operand_reversed:
-; SI: s_add_u32
-; SI: s_addc_u32
define amdgpu_kernel void @sgpr_operand_reversed(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i64 %a) {
+; SI-LABEL: sgpr_operand_reversed:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_add_u32 s4, s4, s6
+; SI-NEXT: s_addc_u32 s5, s5, s7
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: v_mov_b32_e32 v1, s5
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT: s_endpgm
%foo = load i64, ptr addrspace(1) %in, align 8
%result = add i64 %a, %foo
store i64 %result, ptr addrspace(1) %out
ret void
}
-
-; SI-LABEL: {{^}}test_v2i64_sreg:
-; SI: s_add_u32
-; SI: s_addc_u32
-; SI: s_add_u32
-; SI: s_addc_u32
define amdgpu_kernel void @test_v2i64_sreg(ptr addrspace(1) noalias %out, <2 x i64> %a, <2 x i64> %b) {
+; SI-LABEL: test_v2i64_sreg:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_add_u32 s4, s10, s14
+; SI-NEXT: s_addc_u32 s5, s11, s15
+; SI-NEXT: s_add_u32 s6, s8, s12
+; SI-NEXT: s_addc_u32 s7, s9, s13
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v2, s4
+; SI-NEXT: v_mov_b32_e32 v3, s5
+; SI-NEXT: v_mov_b32_e32 v0, s6
+; SI-NEXT: v_mov_b32_e32 v1, s7
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-NEXT: s_endpgm
%result = add <2 x i64> %a, %b
store <2 x i64> %result, ptr addrspace(1) %out
ret void
}
-; SI-LABEL: {{^}}test_v2i64_vreg:
-; SI: v_add_i32
-; SI: v_addc_u32
-; SI: v_add_i32
-; SI: v_addc_u32
define amdgpu_kernel void @test_v2i64_vreg(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %inA, ptr addrspace(1) noalias %inB) {
+; SI-LABEL: test_v2i64_vreg:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s14, 0
+; SI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; SI-NEXT: v_mov_b32_e32 v5, 0
+; SI-NEXT: s_mov_b32 s15, s11
+; SI-NEXT: s_mov_b64 s[6:7], s[14:15]
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b64 s[12:13], s[2:3]
+; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[12:15], 0 addr64
+; SI-NEXT: buffer_load_dwordx4 v[4:7], v[4:5], s[4:7], 0 addr64
+; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_mov_b32 s8, s0
+; SI-NEXT: s_mov_b32 s9, s1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_add_i32_e32 v2, vcc, v2, v6
+; SI-NEXT: v_addc_u32_e32 v3, vcc, v3, v7, vcc
+; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v4
+; SI-NEXT: v_addc_u32_e32 v1, vcc, v1, v5, vcc
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
%a_ptr = getelementptr <2 x i64>, ptr addrspace(1) %inA, i32 %tid
%b_ptr = getelementptr <2 x i64>, ptr addrspace(1) %inB, i32 %tid
@@ -69,14 +139,19 @@ define amdgpu_kernel void @test_v2i64_vreg(ptr addrspace(1) noalias %out, ptr ad
ret void
}
-; SI-LABEL: {{^}}trunc_i64_add_to_i32:
-; SI: s_load_dword s[[SREG0:[0-9]+]]
-; SI: s_load_dword s[[SREG1:[0-9]+]]
-; SI: s_add_i32 [[SRESULT:s[0-9]+]], s[[SREG1]], s[[SREG0]]
-; SI-NOT: addc
-; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
-; SI: buffer_store_dword [[VRESULT]],
define amdgpu_kernel void @trunc_i64_add_to_i32(ptr addrspace(1) %out, i32, i64 %a, i32, i64 %b) {
+; SI-LABEL: trunc_i64_add_to_i32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dword s2, s[4:5], 0xd
+; SI-NEXT: s_load_dword s6, s[4:5], 0x11
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_add_i32 s4, s6, s2
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
%add = add i64 %b, %a
%trunc = trunc i64 %add to i32
store i32 %trunc, ptr addrspace(1) %out, align 8
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll b/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll
index 45192be37c242..9a4040a25419a 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll
@@ -1,24 +1,94 @@
-; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefixes=SI,SICI,SICIVI,PREGFX11,GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefixes=SICI,CIVI,SICIVI,PREGFX11,GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=CIVI,SICIVI,GFX8PLUS,PREGFX11,GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9PLUS,GFX8PLUS,PREGFX11,GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GFX11,GFX9PLUS,GFX8PLUS,GCN %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefixes=GFX7 %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GFX8 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GFX11 %s
-; GCN-LABEL: {{^}}lds_atomic_cmpxchg_ret_i32_offset:
-; GFX9PLUS-NOT: m0
-; SICIVI-DAG: s_mov_b32 m0
-
-; SICI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
-; SICI-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c
-; GFX8PLUS-DAG: s_load_{{dword|b32}} [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c
-; GFX8PLUS-DAG: s_load_{{dword|b32}} [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x70
-; GCN-DAG: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7
-; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
-; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]]
-; PREGFX11: ds_cmpst_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[VCMP]], [[VSWAP]] offset:16
-; GFX11: ds_cmpstore_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[VSWAP]], [[VCMP]] offset:16
-; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_offset(ptr addrspace(1) %out, [8 x i32], ptr addrspace(3) %ptr, [8 x i32], i32 %swap) nounwind {
+; CHECK-LABEL: lds_atomic_cmpxchg_ret_i32_offset:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_dword s2, s[4:5], 0x13
+; CHECK-NEXT: s_load_dword s3, s[4:5], 0x1c
+; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; CHECK-NEXT: v_mov_b32_e32 v0, 7
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v1, s2
+; CHECK-NEXT: v_mov_b32_e32 v2, s3
+; CHECK-NEXT: s_mov_b32 m0, -1
+; CHECK-NEXT: ds_cmpst_rtn_b32 v0, v1, v0, v2 offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_mov_b32 s3, 0xf000
+; CHECK-NEXT: s_mov_b32 s2, -1
+; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; CHECK-NEXT: s_endpgm
+;
+; GFX7-LABEL: lds_atomic_cmpxchg_ret_i32_offset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_load_dword s2, s[4:5], 0x13
+; GFX7-NEXT: s_load_dword s3, s[4:5], 0x1c
+; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GFX7-NEXT: v_mov_b32_e32 v0, 7
+; GFX7-NEXT: s_mov_b32 m0, -1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v1, s2
+; GFX7-NEXT: v_mov_b32_e32 v2, s3
+; GFX7-NEXT: ds_cmpst_rtn_b32 v0, v1, v0, v2 offset:16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT: s_endpgm
+;
+; GFX8-LABEL: lds_atomic_cmpxchg_ret_i32_offset:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_load_dword s2, s[4:5], 0x4c
+; GFX8-NEXT: s_load_dword s3, s[4:5], 0x70
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8-NEXT: v_mov_b32_e32 v0, 7
+; GFX8-NEXT: s_mov_b32 m0, -1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, s2
+; GFX8-NEXT: v_mov_b32_e32 v2, s3
+; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v1, v0, v2 offset:16
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_store_dword v[0:1], v2
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: lds_atomic_cmpxchg_ret_i32_offset:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s2, s[4:5], 0x4c
+; GFX9-NEXT: s_load_dword s3, s[4:5], 0x70
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v0, 7
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NEXT: v_mov_b32_e32 v2, s3
+; GFX9-NEXT: ds_cmpst_rtn_b32 v0, v1, v0, v2 offset:16
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: lds_atomic_cmpxchg_ret_i32_offset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x4c
+; GFX11-NEXT: s_load_b32 s1, s[4:5], 0x70
+; GFX11-NEXT: v_mov_b32_e32 v0, 7
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-NEXT: v_mov_b32_e32 v2, s1
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: ds_cmpstore_rtn_b32 v0, v1, v2, v0 offset:16
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: s_endpgm
%gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4
%pair = cmpxchg ptr addrspace(3) %gep, i32 7, i32 %swap seq_cst monotonic
%result = extractvalue { i32, i1 } %pair, 0
@@ -26,24 +96,100 @@ define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_offset(ptr addrspace(1) %o
ret void
}
-; GCN-LABEL: {{^}}lds_atomic_cmpxchg_ret_i64_offset:
-; GFX9PLUS-NOT: m0
-; SICIVI-DAG: s_mov_b32 m0
-
-; SICI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SICI-DAG: s_load_dwordx2 s[[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
-; GFX8PLUS-DAG: s_load_{{dword|b32}} [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
-; GFX8PLUS-DAG: s_load_{{dwordx2|b64}} s[[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x34
-; GCN-DAG: v_mov_b32_e32 v[[LOVCMP:[0-9]+]], 7
-; GCN-DAG: v_mov_b32_e32 v[[HIVCMP:[0-9]+]], 0
-; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
-; GCN-DAG: v_mov_b32_e32 v[[LOSWAPV:[0-9]+]], s[[LOSWAP]]
-; GCN-DAG: v_mov_b32_e32 v[[HISWAPV:[0-9]+]], s[[HISWAP]]
-; PREGFX11: ds_cmpst_rtn_b64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v[[[LOVCMP]]:[[HIVCMP]]], v[[[LOSWAPV]]:[[HISWAPV]]] offset:32
-; GFX11: ds_cmpstore_rtn_b64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v[[[LOSWAPV]]:[[HISWAPV]]], v[[[LOVCMP]]:[[HIVCMP]]] offset:32
-; GCN: [[RESULT]]
-; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i64_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr, i64 %swap) nounwind {
+; CHECK-LABEL: lds_atomic_cmpxchg_ret_i64_offset:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_dword s6, s[4:5], 0xb
+; CHECK-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0xd
+; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; CHECK-NEXT: v_mov_b32_e32 v0, 7
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v4, s6
+; CHECK-NEXT: v_mov_b32_e32 v2, s2
+; CHECK-NEXT: v_mov_b32_e32 v3, s3
+; CHECK-NEXT: s_mov_b32 m0, -1
+; CHECK-NEXT: ds_cmpst_rtn_b64 v[0:1], v4, v[0:1], v[2:3] offset:32
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_mov_b32 s3, 0xf000
+; CHECK-NEXT: s_mov_b32 s2, -1
+; CHECK-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; CHECK-NEXT: s_endpgm
+;
+; GFX7-LABEL: lds_atomic_cmpxchg_ret_i64_offset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_load_dword s6, s[4:5], 0xb
+; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0xd
+; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GFX7-NEXT: v_mov_b32_e32 v0, 7
+; GFX7-NEXT: v_mov_b32_e32 v1, 0
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v4, s6
+; GFX7-NEXT: v_mov_b32_e32 v2, s2
+; GFX7-NEXT: v_mov_b32_e32 v3, s3
+; GFX7-NEXT: s_mov_b32 m0, -1
+; GFX7-NEXT: ds_cmpst_rtn_b64 v[0:1], v4, v[0:1], v[2:3] offset:32
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX7-NEXT: s_endpgm
+;
+; GFX8-LABEL: lds_atomic_cmpxchg_ret_i64_offset:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX8-NEXT: v_mov_b32_e32 v0, 7
+; GFX8-NEXT: v_mov_b32_e32 v1, 0
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v4, s6
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_mov_b32 m0, -1
+; GFX8-NEXT: ds_cmpst_rtn_b64 v[0:1], v4, v[0:1], v[2:3] offset:32
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: lds_atomic_cmpxchg_ret_i64_offset:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v0, 7
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v4, s6
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: ds_cmpst_rtn_b64 v[0:1], v4, v[0:1], v[2:3] offset:32
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: lds_atomic_cmpxchg_ret_i64_offset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
+; GFX11-NEXT: v_mov_b32_e32 v0, 7
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v4, s2
+; GFX11-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: ds_cmpstore_rtn_b64 v[0:1], v4, v[2:3], v[0:1] offset:32
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_endpgm
%gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4
%pair = cmpxchg ptr addrspace(3) %gep, i64 7, i64 %swap seq_cst monotonic
%result = extractvalue { i64, i1 } %pair, 0
@@ -51,13 +197,103 @@ define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i64_offset(ptr addrspace(1) %o
ret void
}
-; GCN-LABEL: {{^}}lds_atomic_cmpxchg_ret_i32_bad_si_offset
-; GFX9PLUS-NOT: m0
-; SI: ds_cmpst_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; CIVI: ds_cmpst_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
-; GFX9PLUS: ds_{{cmpst|cmpstore}}_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
-; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_bad_si_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr, i32 %swap, i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: lds_atomic_cmpxchg_ret_i32_bad_si_offset:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xc
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_load_dword s3, s[4:5], 0xb
+; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
+; CHECK-NEXT: s_sub_i32 s1, s1, s2
+; CHECK-NEXT: s_lshl_b32 s1, s1, 2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_add_i32 s1, s3, s1
+; CHECK-NEXT: s_add_i32 s1, s1, 16
+; CHECK-NEXT: v_mov_b32_e32 v0, 7
+; CHECK-NEXT: v_mov_b32_e32 v1, s0
+; CHECK-NEXT: v_mov_b32_e32 v2, s1
+; CHECK-NEXT: s_mov_b32 m0, -1
+; CHECK-NEXT: ds_cmpst_rtn_b32 v0, v2, v0, v1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_mov_b32 s7, 0xf000
+; CHECK-NEXT: s_mov_b32 s6, -1
+; CHECK-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; CHECK-NEXT: s_endpgm
+;
+; GFX7-LABEL: lds_atomic_cmpxchg_ret_i32_bad_si_offset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
+; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
+; GFX7-NEXT: v_mov_b32_e32 v0, 7
+; GFX7-NEXT: s_mov_b32 m0, -1
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_sub_i32 s2, s2, s3
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: s_lshl_b32 s1, s2, 2
+; GFX7-NEXT: s_add_i32 s0, s0, s1
+; GFX7-NEXT: v_mov_b32_e32 v2, s0
+; GFX7-NEXT: ds_cmpst_rtn_b32 v0, v2, v0, v1 offset:16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s6, -1
+; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX7-NEXT: s_endpgm
+;
+; GFX8-LABEL: lds_atomic_cmpxchg_ret_i32_bad_si_offset:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
+; GFX8-NEXT: v_mov_b32_e32 v0, 7
+; GFX8-NEXT: s_mov_b32 m0, -1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_sub_i32 s2, s2, s3
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_lshl_b32 s1, s2, 2
+; GFX8-NEXT: s_add_i32 s0, s0, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v2, v0, v1 offset:16
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: flat_store_dword v[0:1], v2
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: lds_atomic_cmpxchg_ret_i32_bad_si_offset:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v0, 7
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_sub_i32 s2, s2, s3
+; GFX9-NEXT: s_lshl_b32 s2, s2, 2
+; GFX9-NEXT: s_add_i32 s0, s0, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-NEXT: ds_cmpst_rtn_b32 v0, v1, v0, v2 offset:16
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: global_store_dword v1, v0, s[6:7]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: lds_atomic_cmpxchg_ret_i32_bad_si_offset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
+; GFX11-NEXT: v_mov_b32_e32 v0, 7
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_sub_i32 s2, s2, s3
+; GFX11-NEXT: v_mov_b32_e32 v2, s1
+; GFX11-NEXT: s_lshl_b32 s2, s2, 2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_i32 s0, s0, s2
+; GFX11-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: ds_cmpstore_rtn_b32 v0, v1, v2, v0 offset:16
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: s_endpgm
%sub = sub i32 %a, %b
%add = add i32 %sub, 4
%gep = getelementptr i32, ptr addrspace(3) %ptr, i32 %add
@@ -67,45 +303,152 @@ define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_bad_si_offset(ptr addrspac
ret void
}
-; GCN-LABEL: {{^}}lds_atomic_cmpxchg_noret_i32_offset:
-; GFX9PLUS-NOT: m0
-; SICIVI-DAG: s_mov_b32 m0
-
-
-; SICI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9
-; SICI-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x12
-; GFX8PLUS-DAG: s_load_{{dword|b32}} [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24
-; GFX8PLUS-DAG: s_load_{{dword|b32}} [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x48
-; GCN-DAG: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7
-; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
-; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]]
-; PREGFX11: ds_cmpst_b32 [[VPTR]], [[VCMP]], [[VSWAP]] offset:16
-; GFX11: ds_cmpstore_b32 [[VPTR]], [[VSWAP]], [[VCMP]] offset:16
-; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_cmpxchg_noret_i32_offset(ptr addrspace(3) %ptr, [8 x i32], i32 %swap) nounwind {
+; CHECK-LABEL: lds_atomic_cmpxchg_noret_i32_offset:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_dword s0, s[4:5], 0x9
+; CHECK-NEXT: s_load_dword s1, s[4:5], 0x12
+; CHECK-NEXT: v_mov_b32_e32 v0, 7
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v1, s0
+; CHECK-NEXT: v_mov_b32_e32 v2, s1
+; CHECK-NEXT: s_mov_b32 m0, -1
+; CHECK-NEXT: ds_cmpst_b32 v1, v0, v2 offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_endpgm
+;
+; GFX7-LABEL: lds_atomic_cmpxchg_noret_i32_offset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_load_dword s0, s[4:5], 0x9
+; GFX7-NEXT: s_load_dword s1, s[4:5], 0x12
+; GFX7-NEXT: v_mov_b32_e32 v0, 7
+; GFX7-NEXT: s_mov_b32 m0, -1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-NEXT: v_mov_b32_e32 v2, s1
+; GFX7-NEXT: ds_cmpst_b32 v1, v0, v2 offset:16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_endpgm
+;
+; GFX8-LABEL: lds_atomic_cmpxchg_noret_i32_offset:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_load_dword s0, s[4:5], 0x24
+; GFX8-NEXT: s_load_dword s1, s[4:5], 0x48
+; GFX8-NEXT: v_mov_b32_e32 v0, 7
+; GFX8-NEXT: s_mov_b32 m0, -1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, s0
+; GFX8-NEXT: v_mov_b32_e32 v2, s1
+; GFX8-NEXT: ds_cmpst_b32 v1, v0, v2 offset:16
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: lds_atomic_cmpxchg_noret_i32_offset:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24
+; GFX9-NEXT: s_load_dword s1, s[4:5], 0x48
+; GFX9-NEXT: v_mov_b32_e32 v0, 7
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-NEXT: ds_cmpst_b32 v1, v0, v2 offset:16
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: lds_atomic_cmpxchg_noret_i32_offset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s1, s[4:5], 0x48
+; GFX11-NEXT: v_mov_b32_e32 v0, 7
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-NEXT: v_mov_b32_e32 v2, s1
+; GFX11-NEXT: ds_cmpstore_b32 v1, v2, v0 offset:16
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
%gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4
%pair = cmpxchg ptr addrspace(3) %gep, i32 7, i32 %swap seq_cst monotonic
%result = extractvalue { i32, i1 } %pair, 0
ret void
}
-; GCN-LABEL: {{^}}lds_atomic_cmpxchg_noret_i64_offset:
-; GFX9PLUS-NOT: m0
-; SICIVI-DAG: s_mov_b32 m0
-
-; SICI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9
-; SICI-DAG: s_load_dwordx2 s[[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; GFX8PLUS-DAG: s_load_{{dword|b32}} [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24
-; GFX8PLUS-DAG: s_load_{{dwordx2|b64}} s[[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
-; GCN-DAG: v_mov_b32_e32 v[[LOVCMP:[0-9]+]], 7
-; GCN-DAG: v_mov_b32_e32 v[[HIVCMP:[0-9]+]], 0
-; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
-; GCN-DAG: v_mov_b32_e32 v[[LOSWAPV:[0-9]+]], s[[LOSWAP]]
-; GCN-DAG: v_mov_b32_e32 v[[HISWAPV:[0-9]+]], s[[HISWAP]]
-; PREGFX11: ds_cmpst_b64 [[VPTR]], v[[[LOVCMP]]:[[HIVCMP]]], v[[[LOSWAPV]]:[[HISWAPV]]] offset:32
-; GFX11: ds_cmpstore_b64 [[VPTR]], v[[[LOSWAPV]]:[[HISWAPV]]], v[[[LOVCMP]]:[[HIVCMP]]] offset:32
-; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_cmpxchg_noret_i64_offset(ptr addrspace(3) %ptr, i64 %swap) nounwind {
+; CHECK-LABEL: lds_atomic_cmpxchg_noret_i64_offset:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_dword s2, s[4:5], 0x9
+; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb
+; CHECK-NEXT: v_mov_b32_e32 v0, 7
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v4, s2
+; CHECK-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-NEXT: s_mov_b32 m0, -1
+; CHECK-NEXT: ds_cmpst_b64 v4, v[0:1], v[2:3] offset:32
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_endpgm
+;
+; GFX7-LABEL: lds_atomic_cmpxchg_noret_i64_offset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_load_dword s2, s[4:5], 0x9
+; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb
+; GFX7-NEXT: v_mov_b32_e32 v0, 7
+; GFX7-NEXT: v_mov_b32_e32 v1, 0
+; GFX7-NEXT: s_mov_b32 m0, -1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v4, s2
+; GFX7-NEXT: v_mov_b32_e32 v3, s1
+; GFX7-NEXT: v_mov_b32_e32 v2, s0
+; GFX7-NEXT: ds_cmpst_b64 v4, v[0:1], v[2:3] offset:32
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_endpgm
+;
+; GFX8-LABEL: lds_atomic_cmpxchg_noret_i64_offset:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_load_dword s2, s[4:5], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c
+; GFX8-NEXT: v_mov_b32_e32 v0, 7
+; GFX8-NEXT: v_mov_b32_e32 v1, 0
+; GFX8-NEXT: s_mov_b32 m0, -1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v4, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: ds_cmpst_b64 v4, v[0:1], v[2:3] offset:32
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: lds_atomic_cmpxchg_noret_i64_offset:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s2, s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c
+; GFX9-NEXT: v_mov_b32_e32 v0, 7
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v4, s2
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: ds_cmpst_b64 v4, v[0:1], v[2:3] offset:32
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: lds_atomic_cmpxchg_noret_i64_offset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
+; GFX11-NEXT: v_mov_b32_e32 v0, 7
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v4, s2
+; GFX11-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-NEXT: ds_cmpstore_b64 v4, v[2:3], v[0:1] offset:32
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
%gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4
%pair = cmpxchg ptr addrspace(3) %gep, i64 7, i64 %swap seq_cst monotonic
%result = extractvalue { i64, i1 } %pair, 0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
index a9a64318cee0f..18afb7c677207 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
; XUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
@@ -7,123 +8,210 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
declare float @llvm.amdgcn.div.fmas.f32(float, float, float, i1) nounwind readnone
declare double @llvm.amdgcn.div.fmas.f64(double, double, double, i1) nounwind readnone
-; GCN-LABEL: {{^}}test_div_fmas_f32:
-; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
-; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c
-; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x25
-
-; VI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c
-; VI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x70
-; VI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x94
-
-; GCN-DAG: s_bitcmp1_b32 s{{[0-9]+}}, 0
-
-; GCN-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]]
-; GCN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
-; GCN-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]]
-; GCN: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], [[VB]], [[VC]]
-; GCN: buffer_store_dword [[RESULT]],
define amdgpu_kernel void @test_div_fmas_f32(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c, [8 x i32], i1 %d) nounwind {
+; GCN-LABEL: test_div_fmas_f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GCN-NEXT: s_load_dword s6, s[4:5], 0x13
+; GCN-NEXT: s_load_dword s2, s[4:5], 0x2e
+; GCN-NEXT: s_load_dword s7, s[4:5], 0x1c
+; GCN-NEXT: s_load_dword s4, s[4:5], 0x25
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_bitcmp1_b32 s2, 0
+; GCN-NEXT: s_cselect_b64 vcc, -1, 0
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: v_mov_b32_e32 v1, s7
+; GCN-NEXT: v_mov_b32_e32 v2, s4
+; GCN-NEXT: v_div_fmas_f32 v0, v0, v1, v2
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
%result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %d) nounwind readnone
store float %result, ptr addrspace(1) %out, align 4
ret void
}
-; GCN-LABEL: {{^}}test_div_fmas_f32_inline_imm_0:
-; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c
-; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x25
-; SI-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]]
-; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
-; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], 1.0, [[VB]], [[VC]]
-; SI: buffer_store_dword [[RESULT]],
define amdgpu_kernel void @test_div_fmas_f32_inline_imm_0(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c, [8 x i32], i1 %d) nounwind {
+; GCN-LABEL: test_div_fmas_f32_inline_imm_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GCN-NEXT: s_load_dword s2, s[4:5], 0x2e
+; GCN-NEXT: s_load_dword s6, s[4:5], 0x1c
+; GCN-NEXT: s_load_dword s4, s[4:5], 0x25
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_bitcmp1_b32 s2, 0
+; GCN-NEXT: s_cselect_b64 vcc, -1, 0
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: v_mov_b32_e32 v1, s4
+; GCN-NEXT: v_div_fmas_f32 v0, 1.0, v0, v1
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
%result = call float @llvm.amdgcn.div.fmas.f32(float 1.0, float %b, float %c, i1 %d) nounwind readnone
store float %result, ptr addrspace(1) %out, align 4
ret void
}
-; GCN-LABEL: {{^}}test_div_fmas_f32_inline_imm_1:
-; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
-
-; VI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c
-; VI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x94
-
-; GCN-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]]
-; GCN-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]]
-; GCN: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], 1.0, [[VC]]
-; GCN: buffer_store_dword [[RESULT]],
define amdgpu_kernel void @test_div_fmas_f32_inline_imm_1(ptr addrspace(1) %out, float %a, float %b, float %c, [8 x i32], i1 %d) nounwind {
+; GCN-LABEL: test_div_fmas_f32_inline_imm_1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GCN-NEXT: s_load_dword s2, s[4:5], 0x16
+; GCN-NEXT: s_load_dword s6, s[4:5], 0xb
+; GCN-NEXT: s_load_dword s4, s[4:5], 0xd
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_bitcmp1_b32 s2, 0
+; GCN-NEXT: s_cselect_b64 vcc, -1, 0
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: v_mov_b32_e32 v1, s4
+; GCN-NEXT: v_div_fmas_f32 v0, v0, 1.0, v1
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
%result = call float @llvm.amdgcn.div.fmas.f32(float %a, float 1.0, float %c, i1 %d) nounwind readnone
store float %result, ptr addrspace(1) %out, align 4
ret void
}
-; GCN-LABEL: {{^}}test_div_fmas_f32_inline_imm_2:
-; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
-; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c
-
-; VI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c
-; VI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x70
-
-; GCN-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]]
-; GCN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
-; GCN: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], [[VB]], 1.0
-; GCN: buffer_store_dword [[RESULT]],
define amdgpu_kernel void @test_div_fmas_f32_inline_imm_2(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c, [8 x i32], i1 %d) nounwind {
+; GCN-LABEL: test_div_fmas_f32_inline_imm_2:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GCN-NEXT: s_load_dword s2, s[4:5], 0x2e
+; GCN-NEXT: s_load_dword s6, s[4:5], 0x13
+; GCN-NEXT: s_load_dword s4, s[4:5], 0x1c
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_bitcmp1_b32 s2, 0
+; GCN-NEXT: s_cselect_b64 vcc, -1, 0
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: v_mov_b32_e32 v1, s4
+; GCN-NEXT: v_div_fmas_f32 v0, v0, v1, 1.0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
%result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float 1.0, i1 %d) nounwind readnone
store float %result, ptr addrspace(1) %out, align 4
ret void
}
-; GCN-LABEL: {{^}}test_div_fmas_f64:
-; GCN: v_div_fmas_f64
define amdgpu_kernel void @test_div_fmas_f64(ptr addrspace(1) %out, double %a, double %b, double %c, i1 %d) nounwind {
+; GCN-LABEL: test_div_fmas_f64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s8, s[4:5], 0x11
+; GCN-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s11, 0xf000
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_bitcmp1_b32 s8, 0
+; GCN-NEXT: s_cselect_b64 vcc, -1, 0
+; GCN-NEXT: s_mov_b32 s10, -1
+; GCN-NEXT: s_mov_b32 s8, s0
+; GCN-NEXT: s_mov_b32 s9, s1
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: v_mov_b32_e32 v2, s4
+; GCN-NEXT: v_mov_b32_e32 v3, s5
+; GCN-NEXT: v_mov_b32_e32 v4, s6
+; GCN-NEXT: v_mov_b32_e32 v5, s7
+; GCN-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; GCN-NEXT: s_endpgm
%result = call double @llvm.amdgcn.div.fmas.f64(double %a, double %b, double %c, i1 %d) nounwind readnone
store double %result, ptr addrspace(1) %out, align 8
ret void
}
-; GCN-LABEL: {{^}}test_div_fmas_f32_cond_to_vcc:
-; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 0{{$}}
-; GCN: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
define amdgpu_kernel void @test_div_fmas_f32_cond_to_vcc(ptr addrspace(1) %out, float %a, float %b, float %c, i32 %i) nounwind {
+; GCN-LABEL: test_div_fmas_f32_cond_to_vcc:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_cmp_eq_u32 s3, 0
+; GCN-NEXT: s_cselect_b64 vcc, -1, 0
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: v_div_fmas_f32 v0, v0, v1, v2
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT: s_endpgm
%cmp = icmp eq i32 %i, 0
%result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %cmp) nounwind readnone
store float %result, ptr addrspace(1) %out, align 4
ret void
}
-; GCN-LABEL: {{^}}test_div_fmas_f32_imm_false_cond_to_vcc:
-; GCN: s_mov_b64 vcc, 0
-; GCN: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
define amdgpu_kernel void @test_div_fmas_f32_imm_false_cond_to_vcc(ptr addrspace(1) %out, float %a, float %b, float %c) nounwind {
+; GCN-LABEL: test_div_fmas_f32_imm_false_cond_to_vcc:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: s_mov_b64 vcc, 0
+; GCN-NEXT: v_div_fmas_f32 v0, v0, v1, v2
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT: s_endpgm
%result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 false) nounwind readnone
store float %result, ptr addrspace(1) %out, align 4
ret void
}
-; GCN-LABEL: {{^}}test_div_fmas_f32_imm_true_cond_to_vcc:
-; GCN: s_mov_b64 vcc, -1
-; GCN: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
define amdgpu_kernel void @test_div_fmas_f32_imm_true_cond_to_vcc(ptr addrspace(1) %out, float %a, float %b, float %c) nounwind {
+; GCN-LABEL: test_div_fmas_f32_imm_true_cond_to_vcc:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: s_mov_b64 vcc, -1
+; GCN-NEXT: v_div_fmas_f32 v0, v0, v1, v2
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT: s_endpgm
%result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 true) nounwind readnone
store float %result, ptr addrspace(1) %out, align 4
ret void
}
-; GCN-LABEL: {{^}}test_div_fmas_f32_logical_cond_to_vcc:
-; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
-; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 glc{{$}}
-; SI-DAG: buffer_load_dword [[C:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
-
-; SI-DAG: v_cmp_eq_u32_e32 [[CMP0:vcc]], 0, v{{[0-9]+}}
-; SI-DAG: s_cmp_lg_u32 s{{[0-9]+}}, 0{{$}}
-; SI-DAG: s_cselect_b64 [[CMP1:s\[[0-9]+:[0-9]+\]]], -1, 0
-; SI: s_and_b64 vcc, [[CMP0]], [[CMP1]]
-; SI: v_div_fmas_f32 {{v[0-9]+}}, [[A]], [[B]], [[C]]
-; SI: s_endpgm
define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %d) nounwind {
+; GCN-LABEL: test_div_fmas_f32_logical_cond_to_vcc:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_load_dword s8, s[4:5], 0xd
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dword v0, v[1:2], s[4:7], 0 addr64 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dword v3, v[1:2], s[4:7], 0 addr64 offset:4 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dword v1, v[1:2], s[4:7], 0 addr64 offset:8 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_cmp_lg_u32 s8, 0
+; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0
+; GCN-NEXT: s_and_b64 vcc, vcc, s[2:3]
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s3, s7
+; GCN-NEXT: v_div_fmas_f32 v0, v0, v3, v1
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.a = getelementptr float, ptr addrspace(1) %in, i32 %tid
%gep.b = getelementptr float, ptr addrspace(1) %gep.a, i32 1
@@ -143,26 +231,39 @@ define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(ptr addrspace(1
ret void
}
-; GCN-LABEL: {{^}}test_div_fmas_f32_i1_phi_vcc:
-
-; SI: ; %entry
-; SI: v_cmp_eq_u32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], 0, {{v[0-9]+}}
-; SI: s_mov_b64 vcc, 0
-; SI: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[CMP]]
-
-; SI: ; %bb
-; SI: buffer_load_dword [[LOAD:v[0-9]+]],
-; SI: v_cmp_ne_u32_e32 vcc, 0, [[LOAD]]
-; SI: s_and_b64 vcc, vcc, exec
-
-; SI: ; %exit
-; SI: s_or_b64 exec, exec, [[SAVE]]
-; SI-NOT: vcc
-; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-; SI: buffer_store_dword
-; SI: s_endpgm
-
define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr addrspace(1) %dummy) nounwind {
+; GCN-LABEL: test_div_fmas_f32_i1_phi_vcc:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 2, v0
+; GCN-NEXT: v_mov_b32_e32 v4, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[0:1], s[10:11]
+; GCN-NEXT: buffer_load_dwordx2 v[1:2], v[3:4], s[0:3], 0 addr64
+; GCN-NEXT: buffer_load_dword v3, v[3:4], s[0:3], 0 addr64 offset:8
+; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v0
+; GCN-NEXT: s_mov_b64 vcc, 0
+; GCN-NEXT: s_and_saveexec_b64 s[10:11], s[0:1]
+; GCN-NEXT: s_cbranch_execz .LBB9_2
+; GCN-NEXT: ; %bb.1: ; %bb
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s7, s3
+; GCN-NEXT: buffer_load_dword v0, off, s[4:7], 0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GCN-NEXT: s_and_b64 vcc, vcc, exec
+; GCN-NEXT: .LBB9_2: ; %exit
+; GCN-NEXT: s_or_b64 exec, exec, s[10:11]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_div_fmas_f32 v0, v1, v2, v3
+; GCN-NEXT: s_mov_b32 s10, -1
+; GCN-NEXT: s_mov_b32 s11, s3
+; GCN-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:8
+; GCN-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.out = getelementptr float, ptr addrspace(1) %out, i32 2
@@ -188,3 +289,5 @@ exit:
store float %result, ptr addrspace(1) %gep.out, align 4
ret void
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; SI: {{.*}}
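
As the NOTE lines in each file record, none of the check lines above were
written by hand: they come from the in-tree update scripts. As a rough
sketch of how to reproduce them (the build/bin path is an assumption about
your local build layout, not part of the patch):

  # Regenerate the llc-based checks (add_i64.ll, atomic_cmp_swap_local.ll,
  # llvm.amdgcn.div.fmas.ll); the script reruns every RUN line and rewrites
  # the CHECK blocks in place:
  python3 llvm/utils/update_llc_test_checks.py \
      --llc-binary=build/bin/llc llvm/test/CodeGen/AMDGPU/add_i64.ll

  # Regenerate the MIR-based checks for the irtranslator test, which stops
  # after -stop-after=irtranslator and checks MachineInstrs instead of ISA:
  python3 llvm/utils/update_mir_test_checks.py \
      --llc-binary=build/bin/llc \
      llvm/test/CodeGen/AMDGPU/GlobalISel/amdgpu-irtranslator.ll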